Unverified Commit 80e05267 authored by Qianfeng, committed by GitHub

Reduction external API and client examples (#493)



* Change the DeviceReduce base class template to include all problem description information

* Add external api for reduction

* Add client example to test the reduction external api

* Spelling correction

* Re-implement the host_reduction to follow the DeviceReduce base API format

* Change the reduce profiler to call the external API for collecting device instances

* Rename reduce client example directory from 08_reduce to 12_reduce

* Remove (void) before the function call

* Tiny update in reduce client example

* Tiny update in profile_reduce_impl.hpp

* Rename the reduce client example directory
Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
parent 7829d729
add_executable(client_reduce_nhwc_c reduce_nhwc_c.cpp)
target_link_libraries(client_reduce_nhwc_c PRIVATE composable_kernel::device_operations)
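For an out-of-tree client, a minimal CMakeLists.txt might look like the sketch below. It assumes composable_kernel has been installed and exports the composable_kernel::device_operations target linked above; the find_package component name is an assumption based on that target.

cmake_minimum_required(VERSION 3.16)
project(ck_reduce_client CXX)
# Hypothetical out-of-tree setup: locate an installed Composable Kernel package
# that provides the device instance libraries used by the example.
find_package(composable_kernel REQUIRED COMPONENTS device_operations)
add_executable(client_reduce_nhwc_c reduce_nhwc_c.cpp)
target_link_libraries(client_reduce_nhwc_c PRIVATE composable_kernel::device_operations)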
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <array>
#include <functional>
#include <iomanip>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"
using InDataType = float;
using OutDataType = float;
using AccDataType = float;
using ReduceAdd = ck::reduce::Add;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide;
constexpr bool PropagateNan = false;
constexpr bool OutputIndex = false;
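// PropagateNan: whether NaN inputs propagate through the reduction.
// OutputIndex: whether the reduction also returns the index of the selected element
// (used by index-capable operations such as min/max/amax; not needed for a sum).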
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
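// Reduce the N, H and W dimensions of an NHWC tensor, keeping a length-C result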
std::array<ck::index_t, Rank> in_lengths{16, 8, 128, 256};
std::array<ck::index_t, Rank> in_strides{8 * 128 * 256, 128 * 256, 256, 1};
std::array<ck::index_t, Rank - NumReduceDim> out_lengths{256};
std::array<ck::index_t, Rank - NumReduceDim> out_strides{1};
std::array<int, NumReduceDim> reduce_dims{0, 1, 2};
ck::index_t num_in_elements =
std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies<ck::index_t>());
ck::index_t num_out_elements =
std::accumulate(out_lengths.begin(), out_lengths.end(), 1, std::multiplies<ck::index_t>());
ck::index_t reduce_length = 1;
for(auto dim : reduce_dims)
reduce_length *= in_lengths[dim];
float alpha{1.0f};
float beta{0.0f};
SimpleDeviceMem in(sizeof(InDataType) * num_in_elements);
SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements);
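// ReduceAdd with a final UnaryDivide by reduce_length computes the mean over the reduced dimensions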
using DeviceOp = ck::tensor_operation::device::DeviceReduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceAdd,
PassThrough,
UnaryDivide,
PropagateNan,
OutputIndex>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
out_lengths,
out_strides,
reduce_dims,
alpha,
beta,
in.GetDeviceBuffer(),
nullptr,
out.GetDeviceBuffer(),
nullptr,
PassThrough{},
UnaryDivide{reduce_length});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
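// Bytes moved: the input is read once; the output is written once and, when
// beta != 0, also read once. ave_time is in milliseconds, so bytes / 1.E6 / ms gives GB/s.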
std::size_t num_bytes = num_in_elements * sizeof(InDataType) +
(beta == 0.0f ? 1 : 2) * num_out_elements * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best intance
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
out_lengths,
out_strides,
reduce_dims,
alpha,
beta,
in.GetDeviceBuffer(),
nullptr,
out.GetDeviceBuffer(),
nullptr,
PassThrough{},
UnaryDivide{reduce_length});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
......@@ -9,6 +9,7 @@
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
......@@ -16,7 +17,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "reduce_example_common.hpp"
......@@ -236,38 +236,57 @@ int reduce_blockwise_impl(bool do_verification,
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
std::array<index_t, Rank> arrInLengths;
std::array<index_t, Rank> arrInStrides;
std::array<index_t, NumOutDim> arrOutLengths;
std::array<index_t, NumOutDim> arrOutStrides;
ck::ranges::copy(inLengths, arrInLengths.begin());
ck::ranges::copy(inStrides, arrInStrides.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verification)
{
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>;
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
arrInStrides,
arrOutLengths,
arrOutStrides,
reduceDims,
alpha,
beta,
in.mData.data(),
nullptr,
out_ref.mData.data(),
out_indices_ref.mData.data(),
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters are not supported by the reduce reference, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
auto reduce = DeviceReduceInstance{};
......@@ -287,8 +306,7 @@ int reduce_blockwise_impl(bool do_verification,
if(!reduce.IsSupportedArgument(argument_ptr.get()))
{
std::cerr << "The runtime parameters are not supported by the DeviceReduce instance, exiting!"
<< std::endl;
return (-2);
......
......@@ -12,13 +12,13 @@
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
......@@ -98,7 +98,7 @@ int main(int argc, char* argv[])
// used by the reference (host) reduction
const std::array<int, 2> reduceDims = {3, 4};
// const std::array<int, 3> invariantDims = {0, 1, 2};
const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
......@@ -191,42 +191,61 @@ int main(int argc, char* argv[])
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
std::array<index_t, 5> arrInLengths_1;
std::array<index_t, 5> arrInStrides_1;
std::array<index_t, 4> arrInLengths_2;
std::array<index_t, 4> arrInStrides_2;
std::array<index_t, 3> arrOutLengths;
std::array<index_t, 3> arrOutStrides;
ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verify)
{
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
5, // Rank
2, // NumReduceDim
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>;
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1,
arrInStrides_1,
arrOutLengths,
arrOutStrides,
reduceDims,
alpha,
beta,
in_1.mData.data(),
nullptr,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters are not supported by the reduce reference, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
auto reduce_1 = DeviceReduceInstance_1{};
......@@ -246,8 +265,7 @@ int main(int argc, char* argv[])
if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
{
std::cout << "The runtime parameters are not supported by the DeviceReduce instance, exiting!"
<< std::endl;
};
......
......@@ -9,6 +9,7 @@
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
......@@ -16,7 +17,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "reduce_example_common.hpp"
......@@ -149,38 +149,57 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
std::array<index_t, Rank> arrInLengths;
std::array<index_t, Rank> arrInStrides;
std::array<index_t, NumOutDim> arrOutLengths;
std::array<index_t, NumOutDim> arrOutStrides;
ck::ranges::copy(inLengths, arrInLengths.begin());
ck::ranges::copy(inStrides, arrInStrides.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verification)
{
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
false>; // no index output with the atomic-add based reduction
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
arrInStrides,
arrOutLengths,
arrOutStrides,
reduceDims,
alpha,
beta,
in.mData.data(),
nullptr,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters are not supported by the reduce reference, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
auto reduce = DeviceReduceInstance{};
......@@ -200,8 +219,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
if(!reduce.IsSupportedArgument(argument_ptr.get()))
{
std::cerr << "The runtime parameters are not supported by the DeviceReduce instance, exiting!"
<< std::endl;
return (-2);
......
......@@ -13,10 +13,16 @@ namespace ck {
namespace tensor_operation {
namespace device {
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
bool OutputIndex>
struct DeviceReduce : public BaseOperator
{
static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
......@@ -39,12 +45,26 @@ struct DeviceReduce : public BaseOperator
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
bool OutputIndex>
using DeviceReducePtr = std::unique_ptr<DeviceReduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>>;
} // namespace device
} // namespace tensor_operation
......
......@@ -73,7 +73,7 @@ struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce<Rank,
static_for<0, NumReduction, 1>{}([&](auto I) {
using OutDataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
flag =
flag && ck::reduce::InMemoryDataOperationSupportedOnDataType<OutMemoryDataOperation,
OutDataType>::value;
});
......
......@@ -40,8 +40,16 @@ template <typename InDataType,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct DeviceReduceMultiBlock : public DeviceReduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>
{
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
......@@ -67,7 +75,7 @@ struct DeviceReduceMultiBlock
static constexpr bool use_multiblock =
(OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);
static_assert(ck::reduce::InMemoryDataOperationSupportedOnDataType<OutMemoryDataOperation,
OutDataType>::value,
"The OutDataType must support the specified OutMemoryDataOperation!");
......
......@@ -35,8 +35,17 @@ template <typename InDataType,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct DeviceReduceThreadWise : public DeviceReduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>
{
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
......
......@@ -251,27 +251,27 @@ constexpr T GetIdentityValueForInMemoryDataOperation(InMemoryDataOperationEnum o
};
template <InMemoryDataOperationEnum Operation, typename DataType>
struct InMemoryDataOperationSupportedOnDataType
{
static constexpr bool value = false;
};
template <typename DataType>
struct InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::AtomicAdd, DataType>
{
static constexpr bool value =
is_same<DataType, float>::value || is_same<DataType, double>::value;
};
template <typename DataType>
struct InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::AtomicMax, DataType>
{
static constexpr bool value =
is_same<DataType, float>::value || is_same<DataType, double>::value;
};
template <typename DataType>
struct InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::Set, DataType>
{
static constexpr bool value =
is_same<DataType, float>::value || is_same<DataType, double>::value ||
......@@ -280,7 +280,7 @@ struct InMemoryDataOperatonSupportedOnDataType<InMemoryDataOperationEnum::Set, D
};
template <typename DataType>
struct InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::Add, DataType>
{
static constexpr bool value =
is_same<DataType, float>::value || is_same<DataType, double>::value ||
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <array>
#include <algorithm>
#include <thread>
#include "ck/ck.hpp"
#include "ck/utility/ignore.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
bool OutputIndex>
struct ReferenceReduce : public device::DeviceReduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>
{
using IndexDataType = int32_t;
static constexpr int NumInvariantDim = Rank - NumReduceDim;
static constexpr index_t NumSrcDim = Rank;
static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
static constexpr bool reduceAllDim = (NumInvariantDim == 0);
struct Argument : public device::BaseArgument
{
Argument(const std::array<index_t, Rank> inLengths,
const std::array<index_t, Rank> inStrides,
const std::array<index_t, NumDstDim> outLengths,
const std::array<index_t, NumDstDim> outStrides,
const std::array<int, NumReduceDim> reduceDims,
float alpha,
float beta,
const InDataType* in_host,
OutDataType* out_host,
IndexDataType* out_index_host,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op)
: reduceDims_(reduceDims),
outLengths_(outLengths),
outStrides_(outStrides),
in_host_(in_host),
out_host_(out_host),
out_index_host_(out_index_host),
in_elementwise_op_(in_elementwise_op),
acc_elementwise_op_(acc_elementwise_op)
{
using ck::host_common::get_index_set;
if(std::any_of(
reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; }))
throw std::runtime_error("Invalid reduce dimensions!");
if constexpr(NumInvariantDim > 0)
{
// get invariant_dims[] and invariant_lengths[]
for(int dim = 0, i = 0; dim < Rank; dim++)
if(std::none_of(
reduceDims.begin(), reduceDims.end(), [&](int d) { return d == dim; }))
{
invariantDims_[i] = dim;
invariant_lengths_[i] = inLengths[dim];
i++;
};
};
// get reduce_lengths_[]
for(int j = 0, i = 0; j < NumReduceDim; j++)
{
int dim = reduceDims[j];
reduce_lengths_[i++] = inLengths[dim];
};
if constexpr(NumInvariantDim > 0)
{
// check invariant_lengths_ and outLengths
for(int i = 0; i < NumInvariantDim; i++)
if(invariant_lengths_[i] != outLengths_[i])
throw std::runtime_error("Invalid lengths parameters!");
}
if constexpr(NumInvariantDim > 0)
{
for(int j = 0, i = 0; j < NumInvariantDim; j++)
{
int dim = invariantDims_[j];
in_invariant_strides_[i] = inStrides[dim];
i++;
};
};
for(int j = 0, i = 0; j < NumReduceDim; j++)
{
int dim = reduceDims_[j];
in_reduce_strides_[i] = inStrides[dim];
i++;
};
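// Pre-enumerate all multi-indices of the invariant and reduce sub-spaces; the
// invoker iterates these index sets instead of recursing over dimensions.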
if constexpr(NumInvariantDim > 0)
invariant_index_set_ = get_index_set<NumInvariantDim>(invariant_lengths_);
reduce_index_set_ = get_index_set<NumReduceDim>(reduce_lengths_);
alpha_ = type_convert<AccDataType>(alpha);
beta_ = type_convert<AccDataType>(beta);
};
const std::array<int, NumReduceDim> reduceDims_;
std::array<int, NumInvariantDim> invariantDims_;
std::array<index_t, NumInvariantDim> invariant_lengths_;
std::array<index_t, NumReduceDim> reduce_lengths_;
const std::array<index_t, NumDstDim> outLengths_;
const std::array<index_t, NumDstDim> outStrides_;
std::array<index_t, NumInvariantDim> in_invariant_strides_;
std::array<index_t, NumReduceDim> in_reduce_strides_;
const InDataType* in_host_;
OutDataType* out_host_;
IndexDataType* out_index_host_;
const InElementwiseOperation in_elementwise_op_;
const AccElementwiseOperation acc_elementwise_op_;
AccDataType alpha_;
AccDataType beta_;
std::vector<std::array<index_t, NumInvariantDim>> invariant_index_set_;
std::vector<std::array<index_t, NumReduceDim>> reduce_index_set_;
};
struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
ignore = stream_config;
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert;
using ck::host_common::get_index_set;
using ck::host_common::get_offset_from_index;
if constexpr(OutputIndex)
{
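// Accumulate the value together with the flattened index of the element that
// produced it; NaN handling is controlled by PropagateNan.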
using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
ReduceOperation,
AccDataType,
IndexDataType>;
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
IndexDataType accuIndex = 0;
for(std::size_t i = 0; i < arg.reduce_index_set_.size(); i++)
{
auto in_offset = get_offset_from_index<NumReduceDim>(
arg.in_reduce_strides_, arg.reduce_index_set_[i]);
auto currVal = type_convert<AccDataType>(arg.in_host_[in_offset]);
arg.in_elementwise_op_(currVal, currVal);
auto currIndex = static_cast<IndexDataType>(i);
Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
};
arg.acc_elementwise_op_(accuVal, accuVal);
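// Final blend: out = alpha * reduced value + beta * prior output value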
if(!float_equal_one{}(arg.alpha_))
accuVal *= type_convert<AccDataType>(arg.alpha_);
if(!float_equal_zero{}(arg.beta_))
accuVal += type_convert<AccDataType>(arg.out_host_[0]) *
type_convert<AccDataType>(arg.beta_);
arg.out_host_[0] = type_convert<OutDataType>(accuVal);
arg.out_index_host_[0] = accuIndex;
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal =
ReduceOperation::template GetIdentityValue<AccDataType>();
IndexDataType accuIndex = 0;
auto in_invariant_offset = get_offset_from_index<NumInvariantDim>(
arg.in_invariant_strides_, invariant_index);
for(std::size_t i = 0; i < arg.reduce_index_set_.size(); i++)
{
auto in_reduce_offset = get_offset_from_index<NumReduceDim>(
arg.in_reduce_strides_, arg.reduce_index_set_[i]);
auto currVal = type_convert<AccDataType>(
arg.in_host_[in_invariant_offset + in_reduce_offset]);
arg.in_elementwise_op_(currVal, currVal);
auto currIndex = static_cast<IndexDataType>(i);
Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
};
arg.acc_elementwise_op_(accuVal, accuVal);
if(!float_equal_one{}(arg.alpha_))
accuVal *= type_convert<AccDataType>(arg.alpha_);
auto dst_offset = get_offset_from_index<NumInvariantDim>(arg.outStrides_,
invariant_index);
if(!float_equal_zero{}(arg.beta_))
accuVal += type_convert<AccDataType>(arg.out_host_[dst_offset]) *
type_convert<AccDataType>(arg.beta_);
arg.out_host_[dst_offset] = type_convert<OutDataType>(accuVal);
arg.out_index_host_[dst_offset] = accuIndex;
};
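// Partition the invariant index set across host threads; joinable_thread joins in
// its destructor, so all partitions complete before `threads` goes out of scope.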
// hardware_concurrency() may return 0; fall back to a single thread
std::size_t num_thread = std::max<std::size_t>(1, std::thread::hardware_concurrency());
std::size_t work_per_thread =
(arg.invariant_index_set_.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t i_begin = it * work_per_thread;
std::size_t i_end =
std::min((it + 1) * work_per_thread, arg.invariant_index_set_.size());
auto f = [=] {
for(std::size_t i = i_begin; i < i_end; i++)
{
thread_reduce_func(arg.invariant_index_set_[i]);
}
};
threads[it] = joinable_thread(f);
}
};
}
else
{
using Accumulation =
ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
for(const auto& reduce_index : arg.reduce_index_set_)
{
auto in_offset = get_offset_from_index<NumReduceDim>(arg.in_reduce_strides_,
reduce_index);
auto currVal = type_convert<AccDataType>(arg.in_host_[in_offset]);
arg.in_elementwise_op_(currVal, currVal);
Accumulation::Calculate(accuVal, currVal);
};
arg.acc_elementwise_op_(accuVal, accuVal);
if(!float_equal_one{}(arg.alpha_))
accuVal *= type_convert<AccDataType>(arg.alpha_);
if(!float_equal_zero{}(arg.beta_))
accuVal += type_convert<AccDataType>(arg.out_host_[0]) *
type_convert<AccDataType>(arg.beta_);
arg.out_host_[0] = type_convert<OutDataType>(accuVal);
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal =
ReduceOperation::template GetIdentityValue<AccDataType>();
auto in_invariant_offset = get_offset_from_index<NumInvariantDim>(
arg.in_invariant_strides_, invariant_index);
for(const auto& reduce_index : arg.reduce_index_set_)
{
auto in_reduce_offset = get_offset_from_index<NumReduceDim>(
arg.in_reduce_strides_, reduce_index);
auto currVal = type_convert<AccDataType>(
arg.in_host_[in_invariant_offset + in_reduce_offset]);
arg.in_elementwise_op_(currVal, currVal);
Accumulation::Calculate(accuVal, currVal);
};
arg.acc_elementwise_op_(accuVal, accuVal);
if(!float_equal_one{}(arg.alpha_))
accuVal *= type_convert<AccDataType>(arg.alpha_);
auto dst_offset = get_offset_from_index<NumInvariantDim>(arg.outStrides_,
invariant_index);
if(!float_equal_zero{}(arg.beta_))
accuVal += type_convert<AccDataType>(arg.out_host_[dst_offset]) *
type_convert<AccDataType>(arg.beta_);
arg.out_host_[dst_offset] = type_convert<OutDataType>(accuVal);
};
std::size_t num_thread = std::max<std::size_t>(1, std::thread::hardware_concurrency());
std::size_t work_per_thread =
(arg.invariant_index_set_.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t i_begin = it * work_per_thread;
std::size_t i_end =
std::min((it + 1) * work_per_thread, arg.invariant_index_set_.size());
auto f = [=] {
for(std::size_t i = i_begin; i < i_end; i++)
{
thread_reduce_func(arg.invariant_index_set_[i]);
}
};
threads[it] = joinable_thread(f);
}
};
};
return (0.0f);
};
float Run(const device::BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
};
};
bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
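// The host reference has no vectorization or launch-grid constraints, so any
// argument built by MakeArgumentPointer is supported.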
ignore = p_arg;
return true;
};
std::unique_ptr<device::BaseArgument>
MakeArgumentPointer(const std::array<index_t, Rank> inLengths,
const std::array<index_t, Rank> inStrides,
const std::array<index_t, NumDstDim> outLengths,
const std::array<index_t, NumDstDim> outStrides,
const std::array<int, NumReduceDim> reduceDims,
float alpha,
float beta,
const void* in_host,
const void* in_index_host,
void* out_host,
void* out_index_host,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op) override
{
ignore = in_index_host;
return std::make_unique<Argument>(inLengths,
inStrides,
outLengths,
outStrides,
reduceDims,
alpha,
beta,
static_cast<const InDataType*>(in_host),
static_cast<OutDataType*>(out_host),
static_cast<IndexDataType*>(out_index_host),
in_elementwise_op,
acc_elementwise_op);
};
std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "Reference_Reduce<" << std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
......@@ -76,8 +76,16 @@ template <typename InDataType,
bool PropagateNan,
bool OutputIndex>
void add_device_reduce_instance_blockwise(
std::vector<DeviceReducePtr<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOp,
AccElementwiseOp,
PropagateNan,
OutputIndex>>& device_op_instances)
{
static_for<0, std::tuple_size<reduce_configuration_1_instances_blockwise>::value, 1>{}(
[&](auto i) {
......
......@@ -15,10 +15,10 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
// clang-format on
} // namespace instance
......
......@@ -15,14 +15,14 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
// clang-format on
} // namespace instance
......
......@@ -15,10 +15,10 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
// clang-format on
} // namespace instance
......
......@@ -15,14 +15,14 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
// clang-format on
} // namespace instance
......
......@@ -15,14 +15,14 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
// clang-format on
} // namespace instance
......
......@@ -15,10 +15,10 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
// clang-format on
} // namespace instance
......
......@@ -15,14 +15,14 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
// clang-format on
} // namespace instance
......
......@@ -15,14 +15,14 @@ namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F16, F16, F16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F16, F16, F16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
// clang-format on
} // namespace instance
......