resolve merge conflicts

3ba485b6 · Jing Zhang · 04c1aa31 · a3c80265 · 3ba485b6 · 04c1aa31
Commit 3ba485b6 authored Oct 12, 2023 by Jing Zhang
20 changed files
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F32;
-using BDataType        = F32;
-using AccDataType      = F32;
-using CShuffleDataType = F32;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = F32;
-using ComputeDataType  = BF16;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F32;
-using BDataType        = F32;
-using AccDataType      = F32;
-using CShuffleDataType = F32;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = F32;
-using ComputeDataType  = F16;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }
--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F64;
-using BDataType        = F64;
-using AccDataType      = F32;
-using CShuffleDataType = F64;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = F64;
-using ComputeDataType  = F32;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }
--- a/example/26_contraction/run_contraction_bilinear_example.inc
+++ b/example/26_contraction/run_contraction_bilinear_example.inc
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include <cstdlib>
-#include <iostream>
-#include <string>
-#include <vector>
-#include "ck/ck.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/numeric.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
-int run_contraction_bilinear_example(int argc, char* argv[])
-{
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
-    // A[M0, M1, K0, K1]
-    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
-    // B[N0, N1, K0, K1]
-    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
-    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
-    // D[M0, M1, N0, N1]
-    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
-    // E[M0, M1, N0, N1]
-    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
-    float alpha = 1.f;
-    float beta  = 1.f;
-    if(argc == 1)
-    {
-        // use default case
-    }
-    else if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 28)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-        const ck::index_t M0 = std::stoi(argv[4]);
-        const ck::index_t M1 = std::stoi(argv[5]);
-        const ck::index_t N0 = std::stoi(argv[6]);
-        const ck::index_t N1 = std::stoi(argv[7]);
-        const ck::index_t K0 = std::stoi(argv[8]);
-        const ck::index_t K1 = std::stoi(argv[9]);
-        a_ms_ks_lengths = {M0, M1, K0, K1};
-        a_ms_ks_strides = {
-            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
-        b_ns_ks_lengths = {N0, N1, K0, K1};
-        b_ns_ks_strides = {
-            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
-        d_ms_ns_lengths = {M0, M1, N0, N1};
-        d_ms_ns_strides = {
-            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
-        e_ms_ns_lengths = {M0, M1, N0, N1};
-        e_ms_ns_strides = {
-            std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
-        alpha = std::stof(argv[26]);
-        beta  = std::stof(argv[27]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
-        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
-        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
-        printf("arg18 to 21: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
-        printf("arg22 to 25: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
-        printf("arg26 to 27: alpha, beta\n");
-        exit(0);
-    }
-    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
-    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
-    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
-    std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
-    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        break;
-    default:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        break;
-    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
-    a_device_buf.ToDevice(a_ms_ks.mData.data());
-    b_device_buf.ToDevice(b_ns_ks.mData.data());
-    d_device_buf.ToDevice(d_ms_ns.mData.data());
-    // set zero
-    e_device_buf.SetZero();
-    auto a_element_op   = AElementOp{};
-    auto b_element_op   = BElementOp{};
-    auto cde_element_op = CDEElementOp{alpha, beta};
-    // device operation
-    auto op       = DeviceOpInstance{};
-    auto invoker  = op.MakeInvoker();
-    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
-                                    b_device_buf.GetDeviceBuffer(),
-                                    std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
-                                    e_device_buf.GetDeviceBuffer(),
-                                    a_ms_ks_lengths,
-                                    a_ms_ks_strides,
-                                    b_ns_ks_lengths,
-                                    b_ns_ks_strides,
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                    e_ms_ns_lengths,
-                                    e_ms_ns_strides,
-                                    a_element_op,
-                                    b_element_op,
-                                    cde_element_op);
-    if(!op.IsSupportedArgument(argument))
-    {
-        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
-        return 0;
-    }
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    ck::index_t M =
-        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
-    ck::index_t N = ck::accumulate_n<ck::index_t>(
-        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
-    ck::index_t K = ck::accumulate_n<ck::index_t>(
-        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
-    std::size_t flop      = std::size_t(2) * M * N * K;
-    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                            sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << op.GetTypeString() << std::endl;
-    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
-    if(do_verification)
-    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-        using ReferenceOpInstance =
-            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
-                                                                      NumDimN,
-                                                                      NumDimK,
-                                                                      ADataType,
-                                                                      BDataType,
-                                                                      CShuffleDataType,
-                                                                      AccDataType,
-                                                                      ComputeDataType,
-                                                                      AElementOp,
-                                                                      BElementOp>;
-        auto ref_op      = ReferenceOpInstance{};
-        auto ref_invoker = ref_op.MakeInvoker();
-        auto ref_argument =
-            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
-        ref_invoker.Run(ref_argument);
-        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
-        {
-            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
-            {
-                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
-                {
-                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
-                    {
-                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
-                                       c_ms_ns_host_result(m0, m1, n0, n1),
-                                       d_ms_ns(m0, m1, n0, n1));
-                    }
-                }
-            }
-        }
-        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
-    }
-    return 0;
-}
--- a/example/26_contraction/run_contraction_scale_example.inc
+++ b/example/26_contraction/run_contraction_scale_example.inc
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include <cstdlib>
-#include <iostream>
-#include <string>
-#include <vector>
-#include "ck/ck.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/numeric.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
-int run_contraction_scale_example(int argc, char* argv[])
-{
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
-    // A[M0, M1, K0, K1]
-    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
-    // B[N0, N1, K0, K1]
-    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
-    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
-    // E[M0, M1, N0, N1]
-    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
-    float scale = 1.f;
-    if(argc == 1)
-    {
-        // use default case
-    }
-    else if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 23)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-        const ck::index_t M0 = std::stoi(argv[4]);
-        const ck::index_t M1 = std::stoi(argv[5]);
-        const ck::index_t N0 = std::stoi(argv[6]);
-        const ck::index_t N1 = std::stoi(argv[7]);
-        const ck::index_t K0 = std::stoi(argv[8]);
-        const ck::index_t K1 = std::stoi(argv[9]);
-        a_ms_ks_lengths = {M0, M1, K0, K1};
-        a_ms_ks_strides = {
-            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
-        b_ns_ks_lengths = {N0, N1, K0, K1};
-        b_ns_ks_strides = {
-            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
-        e_ms_ns_lengths = {M0, M1, N0, N1};
-        e_ms_ns_strides = {
-            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
-        scale = std::stof(argv[22]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
-        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
-        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
-        printf("arg18 to 21: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
-        printf("arg22: scale\n");
-        exit(0);
-    }
-    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
-    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
-    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
-    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        break;
-    default:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        break;
-    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
-    a_device_buf.ToDevice(a_ms_ks.mData.data());
-    b_device_buf.ToDevice(b_ns_ks.mData.data());
-    // set zero
-    e_device_buf.SetZero();
-    auto a_element_op   = AElementOp{};
-    auto b_element_op   = BElementOp{};
-    auto cde_element_op = CDEElementOp{scale};
-    // device operation
-    auto op       = DeviceOpInstance{};
-    auto invoker  = op.MakeInvoker();
-    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
-                                    b_device_buf.GetDeviceBuffer(),
-                                    std::array<const void*, 0>{},
-                                    e_device_buf.GetDeviceBuffer(),
-                                    a_ms_ks_lengths,
-                                    a_ms_ks_strides,
-                                    b_ns_ks_lengths,
-                                    b_ns_ks_strides,
-                                    std::array<std::vector<ck::index_t>, 0>{},
-                                    std::array<std::vector<ck::index_t>, 0>{},
-                                    e_ms_ns_lengths,
-                                    e_ms_ns_strides,
-                                    a_element_op,
-                                    b_element_op,
-                                    cde_element_op);
-    if(!op.IsSupportedArgument(argument))
-    {
-        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
-        return 0;
-    }
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    ck::index_t M =
-        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
-    ck::index_t N = ck::accumulate_n<ck::index_t>(
-        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
-    ck::index_t K = ck::accumulate_n<ck::index_t>(
-        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
-    std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_btype =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + +sizeof(EDataType) * M * N;
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << op.GetTypeString() << std::endl;
-    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
-    if(do_verification)
-    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-        using ReferenceOpInstance =
-            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
-                                                                      NumDimN,
-                                                                      NumDimK,
-                                                                      ADataType,
-                                                                      BDataType,
-                                                                      CShuffleDataType,
-                                                                      AccDataType,
-                                                                      ComputeDataType,
-                                                                      AElementOp,
-                                                                      BElementOp>;
-        auto ref_op      = ReferenceOpInstance{};
-        auto ref_invoker = ref_op.MakeInvoker();
-        auto ref_argument =
-            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
-        ref_invoker.Run(ref_argument);
-        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
-        {
-            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
-            {
-                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
-                {
-                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
-                    {
-                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
-                                       c_ms_ns_host_result(m0, m1, n0, n1));
-                    }
-                }
-            }
-        }
-        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
-    }
-    return 0;
-}
--- a/example/60_gemm_multi_ABD/CMakeLists.txt
+++ b/example/60_gemm_multi_ABD/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
-     add_example_executable(example_gemm_multi_ABD_xdl_fp16 gemm_multi_ABD_xdl_fp16.cpp)
-   set(target 1)
- endif()
-endforeach()
-endif()
--- a/example/61_contraction_multi_ABD/CMakeLists.txt
+++ b/example/61_contraction_multi_ABD/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
-     add_example_executable(example_contraction_multi_ABD_xdl_fp16 contraction_multi_ABD_xdl_fp16.cpp)
-   set(target 1)
- endif()
-endforeach()
-endif()
--- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
+++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/numeric.hpp"
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using F16 = ck::half_t;
-using F32 = float;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using A0DataType       = F16;
-using A1DataType       = F32;
-using BDataType        = F16;
-using AccDataType      = F32;
-using CShuffleDataType = F32;
-using DDataType        = F16;
-using EDataType        = F16;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-struct AlphaBetaAdd
-{
-    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
-    template <typename E, typename C, typename D>
-    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
-    template <>
-    __host__ __device__ constexpr void operator()<ck::half_t, float, ck::half_t>(
-        ck::half_t& e, const float& c, const ck::half_t& d) const
-    {
-        e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
-    };
-    float alpha_;
-    float beta_;
-};
-struct Multiply
-{
-    __host__ __device__ constexpr void
-    operator()(ck::half_t& a, const ck::half_t& a0, const float& a1) const
-    {
-        a = ck::type_convert<ck::half_t>(ck::type_convert<float>(a0) * a1);
-    }
-};
-using AElementOp   = Multiply;
-using BElementOp   = PassThrough;
-using CDEElementOp = AlphaBetaAdd;
-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-using DeviceOpInstance = ck::tensor_operation::device::DeviceContractionMultipleABD_Xdl_CShuffle<
-    NumDimM,
-    NumDimN,
-    NumDimK,
-    ck::Tuple<A0DataType, A1DataType>,
-    ck::Tuple<BDataType>,
-    AccDataType,
-    CShuffleDataType,
-    ck::Tuple<DDataType>,
-    EDataType,
-    AElementOp,
-    BElementOp,
-    CDEElementOp,
-    GemmSpec,
-    1,
-    256,
-    256,
-    128,
-    32,
-    8,
-    8,
-    32,
-    32,
-    4,
-    2,
-    S<4, 64, 1>,
-    S<1, 0, 2>,
-    S<1, 0, 2>,
-    2,
-    8,
-    8,
-    1,
-    S<4, 64, 1>,
-    S<1, 0, 2>,
-    S<1, 0, 2>,
-    2,
-    8,
-    8,
-    1,
-    1,
-    1,
-    S<1, 32, 1, 8>,
-    8>;
-int main(int argc, char* argv[])
-{
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
-    float alpha = 1.0f;
-    float beta  = 1.0f;
-    // A0[M0, M1, K0, K1]
-    std::vector<ck::index_t> a0_ms_ks_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> a0_ms_ks_strides{128 * 32 * 64, 32 * 64, 64, 1};
-    // A1[M1, K1] -> A1[M0, M1, K0, K1]
-    std::vector<ck::index_t> a1_ms_ks_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> a1_ms_ks_strides{0, 64, 0, 1};
-    // B[N0, N1, K0, K1]
-    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
-    std::vector<ck::index_t> b_ns_ks_strides{64 * 32 * 64, 32 * 64, 64, 1};
-    // D[M0, M1, N0, N1]
-    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> d_ms_ns_strides{128 * 32 * 64, 32 * 64, 64, 1};
-    // E[M0, M1, N0, N1]
-    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
-    std::vector<ck::index_t> e_ms_ns_strides{128 * 32 * 64, 32 * 64, 64, 1};
-    if(argc == 1)
-    {
-        // use default case
-    }
-    else if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=no, 1=yes)\n");
-        exit(0);
-    }
-    Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
-    Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
-    std::cout << "a0_ms_ks: " << a0_ms_ks.mDesc << std::endl;
-    std::cout << "a1_ms_ks: " << a1_ms_ks.mDesc << std::endl;
-    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
-    std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
-    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        a0_ms_ks.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
-        a1_ms_ks.GenerateTensorValue(GeneratorTensor_2<A1DataType>{-5, 5});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        break;
-    default:
-        a0_ms_ks.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
-        a1_ms_ks.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0.0, 1.0});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        break;
-    }
-    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem a1_device_buf(sizeof(A1DataType) * a1_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
-    a0_device_buf.ToDevice(a0_ms_ks.mData.data());
-    a1_device_buf.ToDevice(a1_ms_ks.mData.data());
-    b_device_buf.ToDevice(b_ns_ks.mData.data());
-    d_device_buf.ToDevice(d_ms_ns.mData.data());
-    // set zero
-    e_device_buf.SetZero();
-    auto a_element_op   = AElementOp{};
-    auto b_element_op   = BElementOp{};
-    auto cde_element_op = CDEElementOp{alpha, beta};
-    // do GEMM
-    auto device_op = DeviceOpInstance{};
-    auto invoker   = device_op.MakeInvoker();
-    auto argument  = device_op.MakeArgument(
-        std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
-                                   a1_device_buf.GetDeviceBuffer()},
-        std::array<const void*, 1>{b_device_buf.GetDeviceBuffer()},
-        std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
-        e_device_buf.GetDeviceBuffer(),
-        std::array<std::vector<ck::index_t>, 2>{a0_ms_ks_lengths, a1_ms_ks_lengths},
-        std::array<std::vector<ck::index_t>, 2>{a0_ms_ks_strides, a1_ms_ks_strides},
-        std::array<std::vector<ck::index_t>, 1>{b_ns_ks_lengths},
-        std::array<std::vector<ck::index_t>, 1>{b_ns_ks_strides},
-        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-        e_ms_ns_lengths,
-        e_ms_ns_strides,
-        a_element_op,
-        b_element_op,
-        cde_element_op);
-    if(!device_op.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error(
-            "wrong! device_contraction with the specified compilation parameters does "
-            "not support this problem");
-    }
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    if(time_kernel)
-    {
-        ck::index_t M =
-            ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
-        ck::index_t N = ck::accumulate_n<ck::index_t>(
-            e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
-        ck::index_t K = ck::accumulate_n<ck::index_t>(
-            a0_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
-        std::size_t flop = std::size_t(2) * M * N * K;
-        std::size_t num_btype =
-            sizeof(A0DataType) * M * K + sizeof(BDataType) * K * N + +sizeof(EDataType) * M * N;
-        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-        float gb_per_sec = num_btype / 1.E6 / ave_time;
-        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                  << " GB/s" << std::endl;
-    }
-    if(do_verification)
-    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-        Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
-        for(size_t m0 = 0; m0 < a_ms_ks.mDesc.GetLengths()[0]; ++m0)
-        {
-            for(size_t m1 = 0; m1 < a_ms_ks.mDesc.GetLengths()[1]; ++m1)
-            {
-                for(size_t k0 = 0; k0 < a_ms_ks.mDesc.GetLengths()[2]; ++k0)
-                {
-                    for(size_t k1 = 0; k1 < a_ms_ks.mDesc.GetLengths()[3]; ++k1)
-                    {
-                        a_element_op(a_ms_ks(m0, m1, k0, k1),
-                                     a0_ms_ks(m0, m1, k0, k1),
-                                     a1_ms_ks(m0, m1, k0, k1));
-                    }
-                }
-            }
-        }
-        using ReferenceOpInstance =
-            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
-                                                                      NumDimN,
-                                                                      NumDimK,
-                                                                      A0DataType,
-                                                                      BDataType,
-                                                                      CShuffleDataType,
-                                                                      AccDataType,
-                                                                      PassThrough,
-                                                                      BElementOp>;
-        auto ref_op      = ReferenceOpInstance{};
-        auto ref_invoker = ref_op.MakeInvoker();
-        Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
-        auto ref_argument =
-            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, PassThrough{}, b_element_op);
-        ref_invoker.Run(ref_argument);
-        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
-        {
-            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
-            {
-                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
-                {
-                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
-                    {
-                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
-                                       c_ms_ns_host_result(m0, m1, n0, n1),
-                                       d_ms_ns(m0, m1, n0, n1));
-                    }
-                }
-            }
-        }
-        e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
-        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
-    }
-    return 0;
-}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -30,7 +30,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
                set(test 0)
                break()
            elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
-                source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND 
+                source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
                NOT(source MATCHES type OR source MATCHES type1))
                    #if filename contains a type which doesn't match any selected type, mark it for removal
                    set(test 1)
@@ -59,7 +59,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
        set(result 0)
    endif()
    #message("add_example returns ${result}")
-    return(PROPAGATE result)
+    set(result ${result} PARENT_SCOPE)
 endfunction(add_example_executable EXAMPLE_NAME)
 function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
@@ -87,7 +87,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
                    set(test 0)
                    break()
                elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
-                  source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND 
+                  source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
                  NOT(source MATCHES type OR source MATCHES type1))
                    #if filename contains a type which doesn't match any selected type, mark it for removal
                    set(test 1)
@@ -96,7 +96,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
        if(test EQUAL 1)
            message("removing example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
-        endif()    
+        endif()
    endforeach()
    endif()
    foreach(source IN LISTS FILE_NAME)
@@ -114,7 +114,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
        set(result 0)
    endif()
    #message("add_example returns ${result}")
-    return(PROPAGATE result)
+    set(result ${result} PARENT_SCOPE)
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)
 # add all example subdir

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -28,7 +28,8 @@ MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&)
 }
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -58,7 +59,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
    static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};
+    static constexpr auto xdlops_gemm = XdlopsGemm<FloatA, MPerXDL, NPerXDL, KPack, FloatB>{};
    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
@@ -294,9 +295,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_.GetElementSpaceSize());
        static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -318,25 +319,27 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                   b_thread_buf);
                static_for<0, KPerThread, KPack>{}([&](auto k) {
-                    vector_type<FloatAB, KPack> a_thread_vec;
+                    vector_type<FloatA, KPack> a_thread_vec;
-                    vector_type<FloatAB, KPack> b_thread_vec;
+                    vector_type<FloatB, KPack> b_thread_vec;
                    static_for<0, KPack, 1>{}([&](auto i) {
-                        a_thread_vec.template AsType<FloatAB>()(i) = a_thread_buf
+                        a_thread_vec.template AsType<FloatA>()(i) = a_thread_buf
                            [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
-                        b_thread_vec.template AsType<FloatAB>()(i) = b_thread_buf
+                        b_thread_vec.template AsType<FloatB>()(i) = b_thread_buf
                            [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
                    });
-                    using mfma_input_type =
+                    using mfma_input_type_a =
-                        typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                        typename vector_type<FloatA, xdlops_gemm.K1PerXdlops>::type;
+                    using mfma_input_type_b =
+                        typename vector_type<FloatB, xdlops_gemm.K1PerXdlops>::type;
                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
                    xdlops_gemm.template Run(
-                        a_thread_vec.template AsType<mfma_input_type>(),
+                        a_thread_vec.template AsType<mfma_input_type_a>(),
-                        b_thread_vec.template AsType<mfma_input_type>(),
+                        b_thread_vec.template AsType<mfma_input_type_b>(),
                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });
@@ -356,8 +359,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
-                                                         FloatAB,
+                                                         FloatA,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
                                                         Sequence<1, 1, 1, KPerThread>,
@@ -366,8 +369,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         A_K1,
                                                         A_K1>;
-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
-                                                         FloatAB,
+                                                         FloatB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
                                                         Sequence<1, 1, 1, KPerThread>,
@@ -385,7 +388,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 // the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the
 // default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -397,7 +401,8 @@ template <index_t BlockSize,
          index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS>
 struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                 FloatAB,
+                                                                 FloatA,
+                                                                 FloatB,
                                                                 FloatAcc,
                                                                 AK0MK1BlockDesc,
                                                                 BK0NK1BlockDesc,
@@ -408,7 +413,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                                 KPack>
 {
    using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                     FloatAB,
+                                                                     FloatA,
+                                                                     FloatB,
                                                                     FloatAcc,
                                                                     AK0MK1BlockDesc,
                                                                     BK0NK1BlockDesc,
@@ -440,9 +446,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_.GetElementSpaceSize());
        static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) {
@@ -479,20 +485,22 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
            static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                static_for<0, MRepeat, 1>{}([&](auto m0) {
                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<FloatAB, KPack> a_thread_vec;
+                        vector_type<FloatA, KPack> a_thread_vec;
-                        vector_type<FloatAB, KPack> b_thread_vec;
+                        vector_type<FloatB, KPack> b_thread_vec;
                        static_for<0, KPack, 1>{}([&](auto i) {
-                            a_thread_vec.template AsType<FloatAB>()(i) =
+                            a_thread_vec.template AsType<FloatA>()(i) =
                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                    make_tuple(m0, 0, 0, k_ + i))>{}];
-                            b_thread_vec.template AsType<FloatAB>()(i) =
+                            b_thread_vec.template AsType<FloatB>()(i) =
                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                    make_tuple(n0, 0, 0, k_ + i))>{}];
                        });
-                        using mfma_input_type =
+                        using mfma_input_type_a =
-                            typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<FloatA, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_b =
+                            typename vector_type<FloatB, xdlops_gemm.K1PerXdlops>::type;
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -514,8 +522,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        // TODO: insert setprio in more precise manner since we
                        // could have more than >1 MFMA instructions in single call
                        xdlops_gemm.template Run(
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                        if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
                        {
@@ -541,8 +549,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));
-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
-                                                         FloatAB,
+                                                         FloatA,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
@@ -551,8 +559,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         A_K1,
                                                         A_K1>;
-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
-                                                         FloatAB,
+                                                         FloatB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
@@ -568,7 +576,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 };
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -583,7 +592,8 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
    if constexpr(LoopSched == LoopScheduler::Default)
    {
        return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                   FloatAB,
+                                                                   FloatA,
+                                                                   FloatB,
                                                                   FloatAcc,
                                                                   AK0MK1BlockDesc,
                                                                   BK0NK1BlockDesc,
@@ -596,7 +606,8 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
    else if constexpr(LoopSched == LoopScheduler::Interwave)
    {
        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                            FloatAB,
+                                                                            FloatA,
+                                                                            FloatB,
                                                                            FloatAcc,
                                                                            AK0MK1BlockDesc,
                                                                            BK0NK1BlockDesc,
@@ -618,26 +629,27 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
 * 3. configurable k index starting position and step size after each FMA/XDL instruction
 */
-template <index_t BlockSize,
+template <
-          typename FloatAB,
+    index_t BlockSize,
-          typename FloatAcc,
+    typename FloatAB,
-          typename ATileDesc,
+    typename FloatAcc,
-          typename BTileDesc,
+    typename ATileDesc,
-          typename AMmaTileDesc,
+    typename BTileDesc,
-          typename BMmaTileDesc,
+    typename AMmaTileDesc,
-          index_t MPerBlock,
+    typename BMmaTileDesc,
-          index_t NPerBlock,
+    index_t MPerBlock,
-          index_t KPerBlock,
+    index_t NPerBlock,
-          index_t MPerXDL,
+    index_t KPerBlock,
-          index_t NPerXDL,
+    index_t MPerXDL,
-          index_t MRepeat,
+    index_t NPerXDL,
-          index_t NRepeat,
+    index_t MRepeat,
-          index_t KPack,
+    index_t NRepeat,
-          bool TransposeC = false,
+    index_t KPack,
-          index_t AMmaKStride =
+    bool TransposeC = false,
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops,
+    index_t AMmaKStride =
-          index_t BMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops,
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops>
+    index_t BMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
 struct BlockwiseGemmXdlops_v2
 {
    static constexpr auto I0 = Number<0>{};
@@ -654,7 +666,8 @@ struct BlockwiseGemmXdlops_v2
    static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2);
    static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);
-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{};
+    static constexpr auto xdlops_gemm =
+        XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{};
    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;

--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include <array>
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-// GEMM:
-//   input : A0[M0, M1, ... K0, K1, ...], ...
-//   input : B0[N0, N1, ... K0, K1, ...], ...
-//   input : D0[M0, M1, ... N0, N1, ...], D1[M0, M1, ... N0, N1, ...], ...
-//   output : E[M0, M1, ... N0, N1, ...]
-//   C = a_op(A) * b_op(B)
-//   E = cde_op(C, D0, D1, ...)
-// Assume:
-//   D0, D1, ... and E have the same layout
-template <index_t NumDimM,
-          index_t NumDimN,
-          index_t NumDimK,
-          typename AsDataType,
-          typename BsDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
-struct DeviceContractionMultipleABD : public BaseOperator
-{
-    static constexpr index_t NumATensor = AsDataType::Size();
-    static constexpr index_t NumBTensor = BsDataType::Size();
-    static constexpr index_t NumDTensor = DsDataType::Size();
-    virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
-                        std::array<const void*, NumBTensor> p_bs,
-                        std::array<const void*, NumDTensor> p_ds,
-                        void* p_e,
-                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_lengths,
-                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_strides,
-                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_lengths,
-                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_strides,
-                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_lengths,
-                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_strides,
-                        const std::vector<index_t>& e_ms_ns_length,
-                        const std::vector<index_t>& e_ms_ns_stride,
-                        AElementwiseOperation a_element_op,
-                        BElementwiseOperation b_element_op,
-                        CDEElementwiseOperation cde_element_op) = 0;
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-};
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
@@ -33,8 +33,7 @@ template <index_t NumDimM,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
+          typename CDEElementwiseOperation>
-          typename ComputeDataType = ADataType>
 struct DeviceContractionMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -29,7 +29,9 @@ template <ck::index_t NDimSpatial,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename AComputeType = ADataType,
+          typename BComputeType = AComputeType>
 struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
@@ -20,7 +20,9 @@ template <ck::index_t NDimSpatial,
          typename OutDataType,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
+          typename OutElementwiseOperation,
+          typename ComputeTypeA = InDataType,
+          typename ComputeTypeB = ComputeTypeA>
 struct DeviceGroupedConvBwdWeight : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
@@ -29,7 +29,8 @@ template <index_t NDimSpatial,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename ComputeType = ADataType>
 struct DeviceGroupedConvFwdMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -112,7 +112,6 @@ template <index_t NumDimM,
          typename CShuffleDataType,
          typename DsDataType,
          typename EDataType,
-          typename ComputeDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation,
@@ -157,8 +156,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                                        EDataType,
                                        AElementwiseOperation,
                                        BElementwiseOperation,
-                                        CDEElementwiseOperation,
+                                        CDEElementwiseOperation>
-                                        ComputeDataType>
 {
    using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle;
@@ -312,6 +310,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
    using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({{}}, {{}}))>;
    using EGridDesc_M_N  = decltype(MakeEGridDescriptor_M_N({}, {}));
+    using ComputeDataType = ADataType;
    // GridwiseGemm
    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
        ADataType, // TODO: distinguish A/B datatype

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -66,7 +66,8 @@ template <typename ALayout,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          LoopScheduler LoopSched     = make_default_loop_scheduler(),
          PipelineVersion PipelineVer = PipelineVersion::v1,
-          typename ComputeType        = CDataType>
+          typename ComputeTypeA       = CDataType,
+          typename ComputeTypeB       = ComputeTypeA>
 struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
                                                   BLayout,
                                                   CLayout,
@@ -131,7 +132,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
        CShuffleBlockTransferScalarPerVector_NPerBlock,
        LoopSched,
        PipelineVer,
-        ComputeType>;
+        ComputeTypeA,
+        ComputeTypeB>;
    using Argument = typename GridwiseGemm::Argument;