Merge remote-tracking branch 'origin/develop' into stream-k-initial-impl

2f463a94 · carlushuang · ca8b5c79 · ac9e01e2 · 2f463a94 · 2f463a94
Commit 2f463a94 authored May 25, 2023 by carlushuang
20 changed files
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
-add_custom_target(example_convnd_fwd_reduce_xdl)
-
-add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
-add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
-add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
-add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
-
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
-
-if(USE_BITINT_EXTENSION_INT4)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_convnd_fwd_reduce_xdl)
+   add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
+   add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
+   add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
+   add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
+   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
      add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
-endif(USE_BITINT_EXTENSION_INT4)
+   endif(USE_BITINT_EXTENSION_INT4)
+endif()
\ No newline at end of file
--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -17,115 +17,11 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"

 template <typename InDataType,
          typename OutDataType,
-          typename AccDataType,
-          typename IndexDataType,
-          ck::ReduceTensorOp ReduceOpId,
-          bool PropagateNan,
-          bool OutputIndex>
-static void pool_host_verify(const Tensor<InDataType>& in,
-                             Tensor<OutDataType>& out,
-                             Tensor<IndexDataType>& out_indices,
-                             const std::array<ck::index_t, 2>& window_spatial_lengths,
-                             const std::array<ck::index_t, 2>& window_strides,
-                             const std::array<ck::index_t, 2>& in_left_pads,
-                             const std::array<ck::index_t, 2>& /*in_right_pads*/)
-{
-    const int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];
-
-    using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
-
-    auto elementwise_ops =
-        ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
-
-    auto in_elementwise_op  = std::get<0>(elementwise_ops);
-    auto acc_elementwise_op = std::get<1>(elementwise_ops);
-
-    if constexpr(!OutputIndex)
-    {
-        using Accumulation =
-            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
-
-        auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
-
-            for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
-            {
-                ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0];
-                for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
-                {
-                    ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
-                    if(hi >= 0 && hi < static_cast<ck::index_t>(in.mDesc.GetLengths()[2]) &&
-                       wi >= 0 && wi < static_cast<ck::index_t>(in.mDesc.GetLengths()[3]))
-                    {
-                        AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
-
-                        in_elementwise_op(currVal, currVal);
-
-                        Accumulation::Calculate(accuVal, currVal);
-                    }
-                }
-            }
-
-            acc_elementwise_op(accuVal, accuVal);
-
-            out(n, c, ho, wo) = accuVal;
-        };
-
-        make_ParallelTensorFunctor(f_nchw,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    }
-    else
-    {
-        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
-                                                                        ReduceOperation,
-                                                                        AccDataType,
-                                                                        IndexDataType>;
-        auto f_nchw        = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal            = ReduceOperation::template GetIdentityValue<AccDataType>();
-            IndexDataType accuIndex = 0;
-
-            for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
-            {
-                ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0];
-                for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
-                {
-                    ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
-                    {
-                        AccDataType currVal     = static_cast<AccDataType>(in(n, c, hi, wi));
-                        IndexDataType currIndex = y * window_spatial_lengths[1] + x;
-
-                        in_elementwise_op(currVal, currVal);
-
-                        Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
-                    }
-                }
-            }
-
-            acc_elementwise_op(accuVal, accuVal);
-
-            out(n, c, ho, wo)         = accuVal;
-            out_indices(n, c, ho, wo) = accuIndex;
-        };
-
-        make_ParallelTensorFunctor(f_nchw,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    };
-}
-
-template <typename InDataType,
-          typename OutDataType,
-          typename AccDataType,
+          typename ComputeDataType,
          typename IndexDataType,
          typename InLayout,
          typename OutLayout,
@@ -152,7 +48,8 @@ bool pool_test(bool do_verification,
        ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
            InDataType,      // InDataType
            OutDataType,     // OutDataType
-            AccDataType, // AccDataType
+            IndexDataType,   // IndexDataType
+            ComputeDataType, // ComputeDataType
            ReduceOpId,
            OutputIndex,
            64, // BlockSize
@@ -165,10 +62,10 @@ bool pool_test(bool do_verification,
    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;

-    const std::array<ck::index_t, 2> window_spatial_lengths{{Y, X}};
-    const std::array<ck::index_t, 2> window_strides{{window_stride_h, window_stride_w}};
-    const std::array<ck::index_t, 2> input_left_pads{{in_left_pad_h, in_left_pad_w}};
-    const std::array<ck::index_t, 2> input_right_pads{{in_right_pad_h, in_right_pad_w}};
+    const std::vector<ck::index_t> window_spatial_lengths{Y, X};
+    const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
+    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
+    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};

    // tensor layout
    auto f_host_tensor_descriptor =
@@ -219,14 +116,16 @@ bool pool_test(bool do_verification,
        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
-        N,
-        C,
-        std::array<ck::index_t, 2>{{Hi, Wi}},
-        std::array<ck::index_t, 2>{{Y, X}},
-        std::array<ck::index_t, 2>{{Ho, Wo}},
+        {N, C, Hi, Wi},
+        {Y, X},
+        {N, C, Ho, Wo},
+        {C * Hi * Wi, 1, Wi * C, C},
+        {C * Ho * Wo, 1, Wo * C, C},
+        {C * Ho * Wo, 1, Wo * C, C},
        window_strides,
        input_left_pads,
-        input_right_pads);
+        input_right_pads,
+        {2, 3});

    if(!pool.IsSupportedArgument(argument_ptr.get()))
    {
@@ -252,13 +151,20 @@ bool pool_test(bool do_verification,

    if(do_verification)
    {
-        pool_host_verify<InDataType,
+        using ReferencePoolingFwdInstance =
+            ck::tensor_operation::host::ReferencePoolingFwd<4,
+                                                            2,
+                                                            InDataType,
                                                            OutDataType,
-                         AccDataType,
+                                                            ComputeDataType,
                                                            IndexDataType,
                                                            ReduceOpId,
                                                            PropagateNan,
-                         OutputIndex>(in_n_c_hi_wi,
+                                                            OutputIndex>;
+
+        auto ref_pooling          = ReferencePoolingFwdInstance{};
+        auto ref_pooling_invoker  = ref_pooling.MakeInvoker();
+        auto ref_pooling_argument = ref_pooling.MakeArgument(in_n_c_hi_wi,
                                                             out_n_c_ho_wo_host,
                                                             out_indices_n_c_ho_wo_host,
                                                             window_spatial_lengths,
@@ -266,6 +172,8 @@ bool pool_test(bool do_verification,
                                                             input_left_pads,
                                                             input_right_pads);

+        ref_pooling_invoker.Run(ref_pooling_argument);
+
        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());

        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);

--- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
@@ -2,7 +2,6 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
-#include <cstdlib>

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -12,7 +11,7 @@

 using InDataType      = ck::half_t;
 using OutDataType     = ck::half_t;
-using AccDataType = float;
+using ComputeDataType = float;

 using IndexDataType = int32_t;

@@ -91,7 +90,7 @@ int main(int argc, char* argv[])

    bool pass = pool_test<InDataType,
                          OutDataType,
-                          AccDataType,
+                          ComputeDataType,
                          IndexDataType,
                          InLayout,
                          OutLayout,

--- a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
@@ -2,7 +2,6 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
-#include <cstdlib>

 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
@@ -12,7 +11,7 @@

 using InDataType      = float;
 using OutDataType     = float;
-using AccDataType = float;
+using ComputeDataType = float;

 using IndexDataType = int32_t;

@@ -91,7 +90,7 @@ int main(int argc, char* argv[])

    bool pass = pool_test<InDataType,
                          OutDataType,
-                          AccDataType,
+                          ComputeDataType,
                          IndexDataType,
                          InLayout,
                          OutLayout,

--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
@@ -2,5 +2,7 @@
 add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)

 # xdlops
-add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
-add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
\ No newline at end of file
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
+   add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
+endif()
\ No newline at end of file
--- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
-add_custom_target(example_gemm_reduce_xdl)
-add_custom_target(example_gemm_reduce_xdl_max)
-add_custom_target(example_gemm_reduce_xdl_mean_meansquare)
-add_custom_target(example_gemm_add_add_mean_meansquare_xdl)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_gemm_reduce_xdl)
+   add_custom_target(example_gemm_reduce_xdl_max)
+   add_custom_target(example_gemm_reduce_xdl_mean_meansquare)
+   add_custom_target(example_gemm_add_add_mean_meansquare_xdl)

-add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
-add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp)
-add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp)
-add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp)
+   add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
+   add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp)
+   add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp)
+   add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp)

-add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
+   add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)

-add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
-add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp)
-add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp)
-add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp)
+   add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
+   add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp)
+   add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp)
+   add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp)

-add_dependencies(example_gemm_reduce_xdl_max
+   add_dependencies(example_gemm_reduce_xdl_max
                 example_gemm_max_xdl_bf16
                 example_gemm_max_xdl_fp16
                 example_gemm_max_xdl_fp32
                 example_gemm_max_xdl_int8)

-add_dependencies(example_gemm_reduce_xdl_mean_meansquare
+   add_dependencies(example_gemm_reduce_xdl_mean_meansquare
                 example_gemm_mean_meansquare_xdl_fp16
                 example_gemm_mean_meansquare_xdl_fp32
                 example_gemm_mean_meansquare_xdl_bf16
                 example_gemm_add_addsquare_xdl_int8)

-add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16)
+   add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16)

-add_dependencies(example_gemm_reduce_xdl
+   add_dependencies(example_gemm_reduce_xdl
                 example_gemm_reduce_xdl_mean_meansquare
                 example_gemm_reduce_xdl_max
                 example_gemm_add_add_mean_meansquare_xdl)

-if(USE_BITINT_EXTENSION_INT4)
+   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp)
      add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4)
+   endif()
 endif()
--- a/example/17_convnd_bwd_data/CMakeLists.txt
+++ b/example/17_convnd_bwd_data/CMakeLists.txt
-add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp)
-target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
-
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp)
+   target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
+endif()
 add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
 target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
-add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
+endif()

--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
-add_custom_target(example_grouped_conv_bwd_weight)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_grouped_conv_bwd_weight)

-add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
-add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
+   add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)

-
-add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
+   add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
                                                 example_grouped_conv_bwd_weight_xdl_bf16)
+endif()

 add_custom_target(example_grouped_conv_bwd_weight_dl)


--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -18,7 +18,9 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
    // Set split_k = 2 for xdl op, split_k = 1 for dl
    // Dl op doesn't support split_k > 1
    // TODO: Add Dl op split_k > 1 support
-    if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+    if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+         ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+         ck::get_device_name() == "gfx1102"))
    {
        split_k = 2;
    }

--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
-add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
-add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
-add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
-add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
+   add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
+   add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
+   add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
+endif()
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::

 using DeviceOpInstance = DeviceOpInstanceKKNN;

-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
    bool do_verification = true;
@@ -385,7 +251,8 @@ int main(int argc, char* argv[])
    {
        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
                                                                      NumDimN,
                                                                      NumDimK,
                                                                      ADataType,
@@ -393,14 +260,13 @@ int main(int argc, char* argv[])
                                                                      CShuffleDataType,
                                                                      AccDataType,
                                                                      AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
+                                                                      BElementOp>;

-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);

        ref_invoker.Run(ref_argument);


--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::

 using DeviceOpInstance = DeviceOpInstanceKKNN;

-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
    bool do_verification = true;
@@ -385,7 +251,8 @@ int main(int argc, char* argv[])
    {
        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
                                                                      NumDimN,
                                                                      NumDimK,
                                                                      ADataType,
@@ -393,14 +260,13 @@ int main(int argc, char* argv[])
                                                                      CShuffleDataType,
                                                                      AccDataType,
                                                                      AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
+                                                                      BElementOp>;

-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);

        ref_invoker.Run(ref_argument);


--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::

 using DeviceOpInstance = DeviceOpInstanceKKN;

-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
    bool do_verification = true;
@@ -368,7 +234,8 @@ int main(int argc, char* argv[])
    {
        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
                                                                      NumDimN,
                                                                      NumDimK,
                                                                      ADataType,
@@ -376,14 +243,14 @@ int main(int argc, char* argv[])
                                                                      CShuffleDataType,
                                                                      AccDataType,
                                                                      AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
+                                                                      BElementOp>;

-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);

        ref_invoker.Run(ref_argument);


--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::

 using DeviceOpInstance = DeviceOpInstanceKKN;

-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
    bool do_verification = true;
@@ -368,7 +234,8 @@ int main(int argc, char* argv[])
    {
        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
                                                                      NumDimN,
                                                                      NumDimK,
                                                                      ADataType,
@@ -376,14 +243,14 @@ int main(int argc, char* argv[])
                                                                      CShuffleDataType,
                                                                      AccDataType,
                                                                      AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
+                                                                      BElementOp>;

-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);

        ref_invoker.Run(ref_argument);


--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
-add_custom_target(example_grouped_conv_fwd_multiple_d)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_grouped_conv_fwd_multiple_d)

-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)

-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)

-if(USE_BITINT_EXTENSION_INT4)
+   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
-
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
-endif() # USE_BITINT_EXTENSION_INT4
-
+   endif() # USE_BITINT_EXTENSION_INT4
+   add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
+endif()
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
  add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
 endif()
-
-add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
-
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
-add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
-add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
-add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
-if(NOT GPU_TARGETS MATCHES "gfx940")
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
+   if(NOT GPU_TARGETS MATCHES "gfx940")
      add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
-endif()
+   endif()

-if(USE_BITINT_EXTENSION_INT4)
-add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
-endif(USE_BITINT_EXTENSION_INT4)
+   if(USE_BITINT_EXTENSION_INT4)
+      add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
+   endif(USE_BITINT_EXTENSION_INT4)
+endif()
\ No newline at end of file
--- a/example/35_splitK_gemm/CMakeLists.txt
+++ b/example/35_splitK_gemm/CMakeLists.txt
-add_custom_target(example_splitK_gemm_xdl)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_splitK_gemm_xdl)
+   add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
+   add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
+   add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp)
+   add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)

-add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
-add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
-add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp)
-add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
-
-add_dependencies(example_splitK_gemm_xdl
+   add_dependencies(example_splitK_gemm_xdl
                 example_splitK_gemm_xdl_fp32
                 example_splitK_gemm_xdl_fp16
                 example_splitK_gemm_xdl_bfp16
                 example_splitK_gemm_xdl_int8)

-if(USE_BITINT_EXTENSION_INT4)
+   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
      add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+   endif()
 endif()
--- a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
-add_custom_target(example_grouped_conv_bwd_data)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_grouped_conv_bwd_data)
+   add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp)
+   add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)

-add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp)
-add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
-
-add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
-add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
+   add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
+   add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
+endif()
\ No newline at end of file
--- a/example/40_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
+endif()
 # Conv perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)

 # Conv perchannel quantization
 add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)

 # Conv + bias + relu perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)

 # Conv + bias + relu perchannel quantization
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)

 # Conv + bias + tanh perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)