Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0);
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
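// Illustrative note (the real macro lives in the header included above): each
// ..._INST_BY_ID line plausibly expands to an explicit instantiation of the
// corresponding add_device_reduce_instance_* template, keyed by the numeric
// IDs, e.g. for (double, double, double, 0, 0, 0, 4, 0, 1, 2) something like:
//
//   template void add_device_reduce_instance_blockwise_second_call<
//       double, double, double, 4, Sequence<0, 1, 2>,
//       static_cast<ReduceTensorOp_t>(0),        // 0 == ADD
//       static_cast<NanPropagation_t>(0),
//       static_cast<ReduceTensorIndices_t>(0)>(  // 0 == NO_INDICES
//       std::vector<DeviceReducePtr<InElementwiseOp, AccElementwiseOp>>&);
//
// where InElementwiseOp/AccElementwiseOp stand for the reduce_unary_operator
// derived types used later in this commit. The ReduceOpId column follows the
// numbering in the comments: 0=ADD, 2=MIN, 3=MAX, 4=AMAX, 5=AVG, 7=NORM2.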
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 0);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 0); //
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); //
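// Note: atomic-add instances are listed only for ADD/AVG with a float
// OutDataType (even for half_t inputs); the profiler gates on this via
// out_support_atomic_add / op_support_atomic_add further below in this commit.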
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
// Will be moved to use MultiBlockAtomicAdd
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0);
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
include_directories(BEFORE
include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/device/include
${PROJECT_SOURCE_DIR}/device_operation/include
${PROJECT_SOURCE_DIR}/reference_operation/include
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/external/include/half
)
# ck_profiler
@@ -20,22 +26,26 @@ set(PROFILER_SOURCE
src/profile_gemm_bias_2d.cpp
src/profile_gemm_bias_relu.cpp
src/profile_gemm_bias_relu_add.cpp
src/profile_batched_gemm.cpp
src/profile_conv_fwd.cpp
src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_fwd_bias_relu_atomic_add.cpp
src/profile_conv_bwd_data.cpp
src/profile_reduce.cpp
)
add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_bwd_data.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_bwd_data.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ushort;
using INT8 = int8_t;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_bwd_data_instance {
using DeviceConvBwdDataNoOpPtr =
DeviceConvBwdDataPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
std::vector<DeviceConvBwdDataNoOpPtr>&);
} // namespace device_conv2d_bwd_data_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
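// Worked stride example for the lambda above (N_ = 1, C_ = 2, H = 3, W = 4):
//   NCHW-family layouts: dims (N, C, H, W), strides {C*H*W, H*W, W, 1} = {24, 12, 4, 1}
//   NHWC-family layouts: dims kept in (N, C, H, W) order, strides
//   {C*H*W, 1, W*C, C} = {24, 1, 8, 2}, i.e. C varies fastest in memory.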
Tensor<InDataType> in_n_c_hi_wi_host_result(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<InDataType> in_n_c_hi_wi_device_result(
f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvBwdDataInstance =
ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvBwdDataInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
wei_k_c_y_x,
out_n_k_ho_wo,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceMem in_device_buf(sizeof(InDataType) *
in_n_c_hi_wi_device_result.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceConvBwdDataNoOpPtr =
ck::tensor_operation::device::DeviceConvBwdDataPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvBwdDataNoOpPtr> conv_ptrs;
if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
{
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
{
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
{
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
{
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
}
if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
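// ave_time is reported in ms, so flop / 1.E9 / ave_time gives TFLOP/s and
// num_btype / 1.E6 / ave_time gives GB/s. The FLOP count below is the usual
// 2 * N * K * Ho * Wo * C * Y * X (one multiply plus one add per MAC) of the
// equivalent convolution.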
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(tflops > best_tflops)
{
best_conv_name = conv_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", out_n_k_ho_wo.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", in_n_c_hi_wi_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", in_n_c_hi_wi_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace profiler
} // namespace ck
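// A minimal usage sketch for the profiler entry point above (hypothetical
// caller, not part of this commit; all sizes are made-up example values):
//
//   using namespace ck::tensor_layout::convolution;
//   ck::profiler::profile_conv_bwd_data_impl<2, float, float, float, NHWC, KYXC, NHWK>(
//       1,                 // do_verification
//       1,                 // init_method
//       false,             // do_log
//       10,                // nrepeat
//       128, 256, 192,     // N, K, C
//       {71, 71},          // input_spatial_lengths  (Hi, Wi)
//       {3, 3},            // filter_spatial_lengths (Y, X)
//       {36, 36},          // output_spatial_lengths (Ho, Wo)
//       {2, 2},            // conv_filter_strides
//       {1, 1},            // conv_filter_dilations
//       {1, 1},            // input_left_pads
//       {1, 1});           // input_right_pads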
@@ -174,9 +174,9 @@ void profile_conv_fwd_impl(int do_verification,
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
}
-else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
-                  ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
-                  ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
+else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, bhalf_t> &&
+                  ck::is_same_v<ck::remove_cv_t<WeiDataType>, bhalf_t> &&
+                  ck::is_same_v<ck::remove_cv_t<OutDataType>, bhalf_t>)
{
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
......
@@ -26,11 +26,17 @@ void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNo
void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
@@ -91,12 +97,11 @@ void profile_gemm_impl(int do_verification,
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
std::size_t num_thread = std::thread::hardware_concurrency();
switch(init_method)
@@ -122,19 +127,10 @@ void profile_gemm_impl(int do_verification,
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
if(do_verification)
{
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
// if(do_verification)
// {
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
}
// }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
@@ -290,6 +286,29 @@ void profile_gemm_impl(int do_verification,
}
}
}
else if constexpr(is_same<ADataType, ck::bhalf_t>::value &&
is_same<BDataType, ck::bhalf_t>::value &&
is_same<CDataType, ck::bhalf_t>::value)
{
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(gemm_ptrs);
}
}
else if constexpr(is_same<ADataType, int8_t>::value && is_same<BDataType, int8_t>::value &&
is_same<CDataType, int8_t>::value)
{
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(gemm_ptrs);
}
}
if(gemm_ptrs.empty())
{
@@ -351,14 +370,79 @@ void profile_gemm_impl(int do_verification,
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
check_error(c_m_n_host_result, c_m_n_device_result);
if constexpr(is_same<ADataType, ck::bhalf_t>::value &&
is_same<BDataType, ck::bhalf_t>::value &&
is_same<CDataType, ck::bhalf_t>::value)
{
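// bhalf_t has no host-side reference GEMM here, so verification widens the
// inputs and the device result to float and checks against a float reference.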
Tensor<float> a_f32_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<float> b_f32_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<float> c_m_n_host_result(
f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<float> c_m_n_device_f32_result(
f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
bf16_to_f32_(a_m_k, a_f32_m_k);
bf16_to_f32_(b_k_n, b_f32_k_n);
bf16_to_f32_(c_m_n_device_result, c_m_n_device_f32_result);
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<float, float, float, AElementOp, BElementOp, CElementOp>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a_f32_m_k,
b_f32_k_n,
c_m_n_host_result,
a_element_op,
b_element_op,
c_element_op);
ref_invoker.Run(ref_argument);
check_error(c_m_n_host_result, c_m_n_device_f32_result);
if(do_log)
{
LogRangeAsType<float>(
std::cout << "c_host : ", c_m_n_host_result.mData, ",")
<< std::endl;
}
}
else
{
Tensor<CDataType> c_m_n_host_result(
f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
using ReferenceGemmInstance =
ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AElementOp,
BElementOp,
CElementOp>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
check_error(c_m_n_host_result, c_m_n_device_result);
if(do_log)
{
LogRangeAsType<float>(
std::cout << "c_host : ", c_m_n_host_result.mData, ",")
<< std::endl;
}
}
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
<< std::endl;
}
......
#pragma once
#include "device_reduce.hpp"
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_generic_reduction.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
template <int Rank, typename ReduceDims, int ReduceOpId, int NanOpt, int IndicesOpt>
struct ReduceDescription
{
static constexpr int Rank_ = Rank;
static constexpr int ReduceOpId_ = ReduceOpId;
static constexpr int NanOpt_ = NanOpt;
static constexpr int IndicesOpt_ = IndicesOpt;
using ReduceDims_ = ReduceDims;
};
using reduce_description_instances =
std::tuple<ReduceDescription<4, Sequence<0, 1, 2>, 0, 0, 0>, // for ADD
ReduceDescription<4, Sequence<0>, 0, 0, 0>,
ReduceDescription<2, Sequence<1>, 0, 0, 0>,
ReduceDescription<4, Sequence<0, 1, 2>, 5, 0, 0>, // for AVG
ReduceDescription<4, Sequence<0>, 5, 0, 0>,
ReduceDescription<2, Sequence<1>, 5, 0, 0>,
ReduceDescription<4, Sequence<0, 1, 2>, 7, 0, 0>, // for NORM2
ReduceDescription<4, Sequence<0>, 7, 0, 0>,
ReduceDescription<2, Sequence<1>, 7, 0, 0>,
ReduceDescription<4, Sequence<0, 1, 2>, 2, 0, 0>, // for MIN
ReduceDescription<4, Sequence<0>, 2, 0, 0>,
ReduceDescription<2, Sequence<1>, 2, 0, 0>,
ReduceDescription<4, Sequence<0, 1, 2>, 3, 0, 0>, // for MAX
ReduceDescription<4, Sequence<0>, 3, 0, 0>,
ReduceDescription<2, Sequence<1>, 3, 0, 0>,
ReduceDescription<4, Sequence<0, 1, 2>, 4, 0, 0>, // for AMAX
ReduceDescription<4, Sequence<0>, 4, 0, 0>,
ReduceDescription<2, Sequence<1>, 4, 0, 0>,
ReduceDescription<4, Sequence<0, 1, 2>, 2, 0, 1>, // for MIN
ReduceDescription<4, Sequence<0>, 2, 0, 1>,
ReduceDescription<2, Sequence<1>, 2, 0, 1>,
ReduceDescription<4, Sequence<0, 1, 2>, 3, 0, 1>, // for MAX
ReduceDescription<4, Sequence<0>, 3, 0, 1>,
ReduceDescription<2, Sequence<1>, 3, 0, 1>,
ReduceDescription<4, Sequence<0, 1, 2>, 4, 0, 1>, // for AMAX
ReduceDescription<4, Sequence<0>, 4, 0, 1>,
ReduceDescription<2, Sequence<1>, 4, 0, 1>>;
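// Each ReduceDescription entry above mirrors one ADD_*_INST_BY_ID line from the
// instance files (same Rank / ReduceDims / ReduceOpId / NanOpt / IndicesOpt);
// description_match() below is the runtime lookup into this compile-time table.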
template <typename DescriptionType>
bool description_match(const DescriptionType& description,
int Rank,
const std::vector<int>& ReduceDims,
ReduceTensorOp_t ReduceOpId,
NanPropagation_t NanOpt,
ReduceTensorIndices_t IndicesOpt)
{
if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
description.NanOpt_ != static_cast<int>(NanOpt) ||
description.IndicesOpt_ != static_cast<int>(IndicesOpt))
return (false);
if(DescriptionType::ReduceDims_::Size() != ReduceDims.size())
return (false);
bool result = true;
static_for<0, DescriptionType::ReduceDims_::Size(), 1>{}([&](auto i) {
if(DescriptionType::ReduceDims_::At(i) != ReduceDims[i])
result = false;
});
return (result);
};
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
template <int Rank, typename ReduceDims>
static std::vector<int> get_reduce_dims()
{
std::vector<int> resDims;
static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
return (resDims);
};
template <int Rank, typename ReduceDims>
static std::vector<int> get_invariant_dims()
{
std::vector<int> resDims;
unsigned int incFlag = 0;
static_for<0, ReduceDims::Size(), 1>{}(
[&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
for(int dim = 0; dim < Rank; dim++)
{
if(incFlag & (0x1 << dim))
continue;
resDims.push_back(dim);
};
return (resDims);
};
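// Worked example: with Rank = 4 and ReduceDims = Sequence<0, 1, 2>, incFlag
// becomes 0b0111, so get_invariant_dims() returns {3} and get_reduce_dims()
// returns {0, 1, 2}, matching the OuterDims/outLengths logic used below.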
template <typename T>
static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
std::ofstream outFile(fileName, std::ios::binary);
if(outFile)
{
outFile.write(reinterpret_cast<char*>(data), dataNumItems * sizeof(T));
outFile.close();
std::cout << "Write output to file " << fileName << std::endl;
}
else
{
std::cout << "Could not open file " << fileName << " for writing" << std::endl;
}
};
// map the data type used by the GPU kernels to the corresponding type used by the host codes
template <typename inDataType>
struct type_mapping
{
using outDataType = inDataType;
};
template <>
struct type_mapping<ck::half_t>
{
using outDataType = half_float::half;
};
template <typename InDataType,
typename AccDataType,
typename OutDataType,
int Rank,
typename ReduceDims_,
ReduceTensorOp_t ReduceOpId,
NanPropagation_t NanOpt,
ReduceTensorIndices_t IndicesOpt>
void profile_reduce_impl_impl(bool do_verification,
int init_method,
bool do_log,
bool do_dumpout,
int nrepeat,
const std::vector<size_t>& inLengths,
float alpha,
float beta)
{
using namespace ck::tensor_operation::device;
using namespace ck::tensor_operation::device::device_reduce_instance;
using namespace ck::host_reduce;
constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX);
constexpr bool NeedIndices =
(op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::PROPAGATE_NAN);
constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
constexpr bool op_support_atomic_add =
!op_support_indices && ReduceOpId != ReduceTensorOp_t::NORM2;
constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add);
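// Net effect of the three flags above: ADD/AVG with float OutDataType take the
// single-pass MultiBlockAtomicAdd path below; indexable ops (MIN/MAX/AMAX),
// NORM2 and non-float outputs take MultiBlockPartialReduce plus a
// BlockwiseSecondCall instance that reduces the intermediate workspace.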
// 1) If InDataType is half_t, must use half_t as AccDataType for indexable reduction operations
// 2) If InDataType is half_t, must use float as AccDataType for non-indexable reduction
// operations
constexpr bool invalid_reduce_1 =
std::is_same<InDataType, half_t>::value &&
((!op_support_indices && !std::is_same<AccDataType, float>::value) ||
(op_support_indices && !std::is_same<AccDataType, half_t>::value));
// 1) If InDataType is float, must use float as AccDataType for indexable reduction operations
constexpr bool invalid_reduce_2 =
std::is_same<InDataType, float>::value &&
(op_support_indices && !std::is_same<AccDataType, float>::value);
// 1) The indices can only be used when the reduction operation is indexable
constexpr bool invalid_reduce_3 =
(!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
if constexpr(!invalid_reduce)
{
Tensor<InDataType> in(inLengths);
const std::vector<int> OuterDims = get_invariant_dims<Rank, ReduceDims_>();
const std::vector<int> ReduceDims = get_reduce_dims<Rank, ReduceDims_>();
std::vector<size_t> outLengths;
if(OuterDims.empty())
outLengths.push_back(1);
else
for(auto dim : OuterDims)
outLengths.push_back(inLengths[dim]);
Tensor<OutDataType> out_ref(outLengths);
Tensor<OutDataType> out(outLengths);
Tensor<int> out_indices_ref(outLengths);
Tensor<int> out_indices(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = std::thread::hardware_concurrency();
if(do_verification)
{
switch(init_method)
{
case 0:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{}, num_thread);
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{1, 5}, num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
out.mData[i] = out_ref.mData[i];
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
in_dev.ToDevice(in.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
DeviceMem out_indices_dev(indicesSizeInBytes);
float best_avg_time = 0;
float best_gb_per_sec = 0;
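// Three elementwise-op sets are instantiated from reduce_unary_operator;
// judging from how they are used below (an inference from usage, not a quoted
// API doc), the two boolean template flags select the ops for the first/final
// stage of the pipeline:
//   _0 <true, true>  : single-kernel paths (threadwise / blockwise / atomic add)
//   _1 <true, false> : first pass of the two-pass multiblock partial reduce
//   _2 <false, true> : second pass (blockwise_second_call) over the workspace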
using InElementwiseOperation_0 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
InElementwiseOperation;
using AccElementwiseOperation_0 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
AccElementwiseOperation;
using InElementwiseOperation_1 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
InElementwiseOperation;
using AccElementwiseOperation_1 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
AccElementwiseOperation;
using InElementwiseOperation_2 =
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
InElementwiseOperation;
using AccElementwiseOperation_2 =
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
AccElementwiseOperation;
using DeviceReduceInstPtr0 =
DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
using DeviceReduceInstPtr1 =
DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
using DeviceReduceInstPtr2 =
DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
add_device_reduce_instance_threadwise<InDataType,
AccDataType,
OutDataType,
Rank,
ReduceDims_,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
add_device_reduce_instance_blockwise<InDataType,
AccDataType,
OutDataType,
Rank,
ReduceDims_,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
if constexpr(use_atomic_add)
add_device_reduce_instance_multiblock_atomic_add<InDataType,
AccDataType,
OutDataType,
Rank,
ReduceDims_,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
else
add_device_reduce_instance_multiblock_partial_reduce<InDataType,
AccDataType,
OutDataType,
Rank,
ReduceDims_,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce1_ptrs);
// used for secondary reduction
if constexpr(!use_atomic_add)
add_device_reduce_instance_blockwise_second_call<AccDataType,
AccDataType,
OutDataType,
Rank,
ReduceDims_,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce2_ptrs);
if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
{
throw std::runtime_error("Wrong! No device REDUCE instance found");
};
if(do_verification)
{
using hInType = typename type_mapping<InDataType>::outDataType;
using hOutType = typename type_mapping<OutDataType>::outDataType;
using hCompType = typename type_mapping<AccDataType>::outDataType;
ReductionHost<hInType, hCompType, hOutType, ReduceOpId, PropagateNan, NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, OuterDims, ReduceDims);
hostReduce.Run(alpha,
reinterpret_cast<const hInType*>(in.mData.data()),
beta,
reinterpret_cast<hOutType*>(out_ref.mData.data()),
out_indices_ref.mData.data());
};
const auto i_inLengths = to_int_vector(inLengths);
const auto i_inStrides = to_int_vector(inStrides);
const auto i_outLengths = to_int_vector(outLengths);
const auto i_outStrides = to_int_vector(outStrides);
for(auto& reduce_ptr : reduce0_ptrs)
{
auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths);
DeviceMem ws_dev(wsSizeInBytes);
auto argument_ptr = reduce_ptr->MakeArgumentPointer(
i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
alpha,
beta,
in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
InElementwiseOperation_0{static_cast<int32_t>(reduce_total_length)},
AccElementwiseOperation_0{static_cast<int32_t>(reduce_total_length)});
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue;
std::string reduce_name = reduce_ptr->GetTypeString();
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t num_bytes =
invariant_total_length * reduce_total_length * sizeof(InDataType) +
invariant_total_length * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl;
if(gb_per_sec > best_gb_per_sec)
{
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
check_error(out_ref, out);
if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
check_indices(out_indices_ref, out_indices);
};
if(do_log)
{
LogRangeAsType<float>(std::cout << "out_host : ", out_ref.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",") << std::endl;
};
};
if(do_dumpout)
{
dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize());
dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
dumpBufferToFile(
"dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
if(NeedIndices)
{
dumpBufferToFile("dump_indices.bin",
out_indices.mData.data(),
out_indices.mDesc.GetElementSize());
dumpBufferToFile("dump_indices_host.bin",
out_indices_ref.mData.data(),
out_indices_ref.mDesc.GetElementSize());
};
};
};
for(auto& reduce_ptr : reduce1_ptrs)
{
auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths);
DeviceMem ws_dev(wsSizeInBytes);
auto argument_ptr = reduce_ptr->MakeArgumentPointer(
i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
alpha,
beta,
in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
InElementwiseOperation_1{static_cast<int32_t>(reduce_total_length)},
AccElementwiseOperation_1{static_cast<int32_t>(reduce_total_length)});
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue;
std::string reduce_name = reduce_ptr->GetTypeString();
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t num_bytes =
invariant_total_length * reduce_total_length * sizeof(InDataType) +
invariant_total_length * sizeof(OutDataType);
std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
std::vector<int> inStrides2{inLengths2[1], 1};
for(auto& reduce2_ptr : reduce2_ptrs)
{
auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(
inLengths2,
inStrides2,
i_outLengths,
i_outStrides,
alpha,
beta,
ws_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
InElementwiseOperation_2{static_cast<int32_t>(reduce_total_length)},
AccElementwiseOperation_2{static_cast<int32_t>(reduce_total_length)});
if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
continue;
std::string reduce2_name = reduce2_ptr->GetTypeString();
auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat);
std::size_t num_bytes_2 =
static_cast<size_t>(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType);
float gb_per_sec = (num_bytes + num_bytes_2) / 1.E6 / (avg_time + avg_time_2);
std::cout << "Perf: " << (avg_time + avg_time_2) << " ms, " << gb_per_sec
<< " GB/s, " << reduce_name << " => " << reduce2_name << std::endl;
if(gb_per_sec > best_gb_per_sec)
{
best_avg_time = avg_time + avg_time_2;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
check_error(out_ref, out);
if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
check_indices(out_indices_ref, out_indices);
};
if(do_log)
{
LogRangeAsType<float>(std::cout << "out_host : ", out_ref.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",")
<< std::endl;
}
}
if(do_dumpout)
{
dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize());
dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
dumpBufferToFile(
"dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
if(NeedIndices)
{
dumpBufferToFile("dump_indices.bin",
out_indices.mData.data(),
out_indices.mDesc.GetElementSize());
dumpBufferToFile("dump_indices_host.bin",
out_indices_ref.mData.data(),
out_indices_ref.mDesc.GetElementSize());
};
};
};
};
std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s"
<< std::endl;
}
else
{
std::cout << "The requested reduction operation is not supported, please check !!!"
<< std::endl;
};
};
template <typename InDataType, typename AccDataType, typename OutDataType>
void profile_reduce_impl(bool do_verification,
int init_method,
bool do_log,
bool do_dumpout,
int nrepeat,
const std::vector<size_t>& inLengths,
const std::vector<int>& ReduceDims,
ReduceTensorOp_t ReduceOpId,
NanPropagation_t NanOpt,
ReduceTensorIndices_t IndicesOpt,
float alpha,
float beta)
{
bool matched = false;
using tuple_of_description_instances =
tensor_operation::device::device_reduce_instance::reduce_description_instances;
const auto tuple_object = tuple_of_description_instances{};
static_for<0, std::tuple_size<tuple_of_description_instances>::value, 1>{}([&](auto i) {
if(matched)
return;
using descType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;
if(!description_match(
descType{}, inLengths.size(), ReduceDims, ReduceOpId, NanOpt, IndicesOpt))
return;
profile_reduce_impl_impl<InDataType,
AccDataType,
OutDataType,
descType::Rank_,
typename descType::ReduceDims_,
static_cast<ReduceTensorOp_t>(descType::ReduceOpId_),
static_cast<NanPropagation_t>(descType::NanOpt_),
static_cast<ReduceTensorIndices_t>(descType::IndicesOpt_)>(
do_verification, init_method, do_log, do_dumpout, nrepeat, inLengths, alpha, beta);
matched = true;
});
};
} // namespace profiler
} // namespace ck
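// A minimal usage sketch for the dispatcher above (hypothetical caller, not
// part of this commit; lengths and dims are made-up example values):
//
//   ck::profiler::profile_reduce_impl<ck::half_t, float, ck::half_t>(
//       true, 1, false, false, 10,        // do_verification, init_method, do_log, do_dumpout, nrepeat
//       {64, 4, 280, 82},                 // inLengths (Rank 4)
//       {0, 1, 2},                        // ReduceDims
//       static_cast<ReduceTensorOp_t>(5), // ReduceOpId: 5 == AVG in the tables above
//       static_cast<NanPropagation_t>(0), // NanOpt
//       ck::ReduceTensorIndices_t::NO_INDICES,
//       1.0f, 0.0f);                      // alpha, beta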