Commit 578ffb6b authored by Chao Liu

update example

parent 5816a647
@@ -27,28 +27,29 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-using ADataType   = ck::half_t;
-using BDataType   = ck::half_t;
-using CDataType   = ck::half_t;
-using AccDataType = float;
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F16;
 
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
 
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
 
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 
 // clang-format off
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
-//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| ALayout| BLayout| ELayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
 //######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
 //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < Row, Col, Row, F16, F16, F16, F32, F32, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+        < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
@@ -69,7 +70,11 @@ int main(int argc, char* argv[])
     ck::index_t StrideB = 4096;
     ck::index_t StrideC = 4096;
 
-    if(argc == 4)
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
@@ -93,7 +98,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
        exit(0);
     }
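For reference, the nine-argument branch can be driven like this (the binary name and the sizes are illustrative only, assuming this example builds to a standalone executable; 3840 = 15 x 256, 4096 = 32 x 128 = 128 x 32, matching the "M (256x), N(128x), K(32x)" constraints):

    ./example_gemm_xdl 1 1 1 3840 4096 4096 4096 4096 4096

Here argv[1..3] are verification, initialization, and time-kernel flags, and argv[4..9] are M, N, K, StrideA, StrideB, StrideC.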
...
@@ -10,10 +10,10 @@
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
-#include "device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"
+#include "device_gemm_multiple_d_xdl_cshuffle.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -26,8 +26,8 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-// E = FastGelu((A * B) + D0 + D1)
 // C = A * B
+// E = FastGelu(C + D0 + D1)
 struct AddAddFastGelu
 {
     __host__ __device__ void
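For illustration only (not part of this commit): a minimal standalone sketch of the fused epilogue above, assuming the common tanh-based fast-GELU approximation. CK's own FastGelu may use a different approximation; this just shows the shape of the computation.

    #include <cmath>

    // e = FastGelu(c + d0 + d1), with FastGelu(x) approximated as
    // 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    inline float add_add_fast_gelu(float c, float d0, float d1)
    {
        const float x = c + d0 + d1;
        const float u = 0.7978845608f * (x + 0.044715f * x * x * x);
        return 0.5f * x * (1.0f + std::tanh(u));
    }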
@@ -69,7 +69,7 @@ using CDEElementOp = AddAddFastGelu;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 
 // clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 //######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
 //######| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
@@ -90,6 +90,7 @@ int main(int argc, char* argv[])
     ck::index_t StrideA = 4096;
     ck::index_t StrideB = 4096;
+    ck::index_t StrideD1 = 4096;
     ck::index_t StrideE = 4096;
 
     if(argc == 1)
@@ -102,7 +103,7 @@ int main(int argc, char* argv[])
         init_method     = std::stoi(argv[2]);
         time_kernel     = std::stoi(argv[3]);
     }
-    else if(argc == 10)
+    else if(argc == 11)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
@@ -114,14 +115,15 @@ int main(int argc, char* argv[])
         StrideA = std::stoi(argv[7]);
         StrideB = std::stoi(argv[8]);
-        StrideE = std::stoi(argv[9]);
+        StrideD1 = std::stoi(argv[9]);
+        StrideE  = std::stoi(argv[10]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
+        printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD1, StrideE\n");
         exit(0);
     }
@@ -184,30 +186,28 @@ int main(int argc, char* argv[])
     auto cde_element_op = CDEElementOp{};
 
     // do GEMM
-    auto gemm    = DeviceGemmInstance{};
-    auto invoker = gemm.MakeInvoker();
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
     auto argument =
-        gemm.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
-                          b_k_n_device_buf.GetDeviceBuffer(),
-                          std::array<const void*, 2>{{d0_m_n_device_buf.GetDeviceBuffer(),
-                                                      d1_m_n_device_buf.GetDeviceBuffer()}},
-                          e_m_n_device_buf.GetDeviceBuffer(),
-                          M,
-                          N,
-                          K,
-                          StrideA,
-                          StrideB,
-                          std::array<ck::index_t, 2>{{0, StrideE}},
-                          StrideE,
-                          a_element_op,
-                          b_element_op,
-                          cde_element_op);
+        device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
+                               b_k_n_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                                          d1_m_n_device_buf.GetDeviceBuffer()},
+                               e_m_n_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, 2>{0, StrideD1},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
 
-    if(!gemm.IsSupportedArgument(argument))
+    if(!device_op.IsSupportedArgument(argument))
     {
-        throw std::runtime_error(
-            "wrong! device_gemm with the specified compilation parameters does "
-            "not support this GEMM problem");
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
     }
 
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
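A note on the D strides (editorial aside, not in the commit): the new argument list passes std::array<ck::index_t, 2>{0, StrideD1}, so D0 keeps a row stride of 0 while D1 gets its own stride. With a row-major M x N view, a stride of 0 makes every row alias the same N values, i.e. D0 behaves as a 1 x N bias broadcast along M. A minimal sketch of the offset arithmetic behind this:

    // offset of element (m, n) in a row-major view with a given row stride;
    // stride == 0 collapses all rows onto the same N-element buffer
    ck::index_t row_major_offset(ck::index_t m, ck::index_t n, ck::index_t stride)
    {
        return m * stride + n; // stride = 0 -> offset depends on n only
    }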
@@ -222,7 +222,7 @@ int main(int argc, char* argv[])
     float gb_per_sec = num_btype / 1.E6 / ave_time;
 
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << gemm.GetTypeString() << std::endl;
+              << device_op.GetTypeString() << std::endl;
 
     if(do_verification)
     {
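Unit check for these metrics (editorial aside): ave_time is in milliseconds, so flop / 1.E9 / ave_time equals flop / (10^12 * seconds), i.e. TFLOP/s, and num_btype / 1.E6 / ave_time equals bytes / (10^9 * seconds), i.e. GB/s. As a worked example, M = N = K = 4096 gives flop = 2 * 4096^3 ~= 1.37 * 10^11, so an ave_time of 1 ms corresponds to roughly 137 TFLOPS.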
@@ -237,6 +237,7 @@ int main(int argc, char* argv[])
                                                                         AElementOp,
                                                                         BElementOp,
                                                                         PassThrough>;
+
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
...
@@ -3,83 +3,103 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-#include <half.hpp>
 
 #include "check_err.hpp"
 #include "config.hpp"
-#include "print.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
-#include "host_gemm.hpp"
 #include "device_tensor.hpp"
 #include "element_wise_operation.hpp"
-#include "device_gemm_xdl_c_shuffle_bias_activation.hpp"
-#include "reference_gemm_bias_activation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+#include "device_gemm_multiple_d_xdl_cshuffle.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
-using ADataType   = ck::half_t;
-using BDataType   = ck::half_t;
-using CDataType   = ck::half_t;
-using AccDataType = float;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::AddRelu;
-
-// clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_Activation<
-    ADataType,   // ADataType
-    BDataType,   // BDataType
-    CDataType,   // CDataType
-    AccDataType, // AccDataType
-    ALayout,     // ALayout
-    BLayout,     // BLayout
-    CLayout,     // CLayout
-    AElementOp,  // AElementwiseOperation
-    BElementOp,  // BElementwiseOperation
-    CElementOp,  // CElementwiseOperation
-    256,         // BlockSize
-    256,         // MPerBlock
-    128,         // NPerBlock
-    4,           // K0PerBlock
-    8,           // K1
-    32,          // MPerXDL
-    32,          // NPerXDL
-    4,           // MXdlPerWave
-    2,           // NXdlPerWave
-    S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
-    S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-    S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-    2,           // ABlockTransferSrcVectorDim
-    8,           // ABlockTransferSrcScalarPerVector
-    8,           // ABlockTransferDstScalarPerVector_K1
-    true,        // ABlockLdsAddExtraM
-    S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
-    S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-    S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-    2,           // BBlockTransferSrcVectorDim
-    8,           // BBlockTransferSrcScalarPerVector
-    8,           // BBlockTransferDstScalarPerVector_K1
-    true,        // BBlockLdsAddExtraN
-    1,           // CShuffleMXdlPerWavePerShuffle
-    1,           // CShuffleNXdlPerWavePerShuffle
-    S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-    8>;          // CBlockTransferScalarPerVector_NWaveNPerXdl
-// clang-format on
-
-using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActivation<ADataType,
-                                                                                      BDataType,
-                                                                                      CDataType,
-                                                                                      AElementOp,
-                                                                                      BElementOp,
-                                                                                      CElementOp>;
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// C = A * B
+// E = Relu(C + D)
+struct AddRelu
+{
+    __host__ __device__ void
+    operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d) const
+    {
+        const ck::half_t x = c + d;
+
+        e = x > 0 ? x : 0;
+    }
+};
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DDataType        = F16;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddRelu;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
+                                                                   BLayout,
+                                                                   ELayout,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   AccDataType,
+                                                                   CShuffleDataType,
+                                                                   DsDataType,
+                                                                   EDataType,
+                                                                   AElementOp,
+                                                                   BElementOp,
+                                                                   CDEElementOp,
+                                                                   GemmDefault,
+                                                                   1,
+                                                                   256,
+                                                                   256,
+                                                                   128,
+                                                                   32,
+                                                                   8,
+                                                                   8,
+                                                                   32,
+                                                                   32,
+                                                                   4,
+                                                                   2,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   1,
+                                                                   1,
+                                                                   S<1, 32, 1, 8>,
+                                                                   8>;
 
 int main(int argc, char* argv[])
 {
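A quick host-side usage example of the AddRelu functor defined above (editorial illustration, not in the commit; the functor is __host__ __device__, so it is callable on the CPU):

    ck::half_t e;
    const auto c = static_cast<ck::half_t>(-1.5f);
    const auto d = static_cast<ck::half_t>(0.5f);
    AddRelu{}(e, c, d); // c + d = -1.0, clamped to 0, so e == 0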
@@ -94,9 +114,13 @@ int main(int argc, char* argv[])
     ck::index_t StrideA = 4096;
     ck::index_t StrideB = 4096;
-    ck::index_t StrideC = 4096;
+    ck::index_t StrideE = 4096;
 
-    if(argc == 4)
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
@@ -114,14 +138,14 @@ int main(int argc, char* argv[])
         StrideA = std::stoi(argv[7]);
         StrideB = std::stoi(argv[8]);
-        StrideC = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[9]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
         exit(0);
     }
@@ -141,17 +165,14 @@ int main(int argc, char* argv[])
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-
-    // c0_n[n]
-    Tensor<CDataType> c0_n(HostTensorDescriptor(
-        std::vector<std::size_t>({static_cast<std::size_t>(N)}), std::vector<std::size_t>({1})));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    std::cout << "c0_n: " << c0_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
 
     switch(init_method)
     {
@@ -159,59 +180,59 @@ int main(int argc, char* argv[])
     case 1:
         a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
         b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        c0_n.GenerateTensorValue(GeneratorTensor_2<CDataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
         break;
     default:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        c0_n.GenerateTensorValue(GeneratorTensor_3<CDataType>{0.0, 1.0});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
     }
 
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
     DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem c0_n_device_buf(sizeof(CDataType) * c0_n.mDesc.GetElementSpace());
+    DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace());
+    DeviceMem e_m_n_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
 
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
-    c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data());
-    c0_n_device_buf.ToDevice(c0_n.mData.data());
+    d_m_n_device_buf.ToDevice(d_m_n.mData.data());
 
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
-    auto c_element_op = CElementOp{};
+    auto cde_element_op = CDEElementOp{};
 
     // do GEMM
-    auto gemm    = DeviceGemmInstance{};
-    auto invoker = gemm.MakeInvoker();
-    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c0_n_device_buf.GetDeviceBuffer()),
-                                      M,
-                                      N,
-                                      K,
-                                      StrideA,
-                                      StrideB,
-                                      StrideC,
-                                      a_element_op,
-                                      b_element_op,
-                                      c_element_op);
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
+                               b_k_n_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, 1>{d_m_n_device_buf.GetDeviceBuffer()},
+                               e_m_n_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, 1>{0},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
 
-    if(!gemm.IsSupportedArgument(argument))
+    if(!device_op.IsSupportedArgument(argument))
     {
-        throw std::runtime_error(
-            "wrong! device_gemm with the specified compilation parameters does "
-            "not support this GEMM problem");
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
    }
 
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
-                            sizeof(CDataType) * M * N + sizeof(CDataType) * N;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(EDataType) * M * N + sizeof(EDataType) * N;
 
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
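Why these byte counts (editorial aside): the kernel reads A (M*K elements), reads B (K*N elements; the old K*M term was a typo this commit fixes), reads the bias D once (N elements, since its row stride is 0), and writes E (M*N elements). As a worked example, with M = N = K = 4096 and 2-byte F16 elements, each of the three M*N-sized traffic terms is 2 * 16,777,216 = 33,554,432 bytes, so num_btype ~= 100.7 MB plus about 8 KB for the bias row.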
@@ -220,19 +241,37 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
               << std::endl;
 
-    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
     if(do_verification)
     {
+        e_m_n_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        Tensor<AccDataType> c_m_n(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
 
-        auto ref_argument = ref_gemm.MakeArgument(
-            a_m_k, b_k_n, c_m_n_host_result, c0_n, a_element_op, b_element_op, c_element_op);
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
 
         ref_invoker.Run(ref_argument);
 
-        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
     }
 
     return 0;
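In other words, the host-side check reproduces the fused epilogue explicitly: the reference GEMM accumulates C = A * B in AccDataType (F32) with a PassThrough output op, and the nested loop then applies AddRelu per element, computing E[m, n] = max(0, sum_k A[m, k] * B[k, n] + D[m, n]) before comparing against the device result.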
...