Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
72e0c1c5
Commit
72e0c1c5
authored
Jun 19, 2023
by
Rostyslav Geyyer
Browse files
Merge branch 'develop' into lwpck-739
parents
898866e0
f0c620c4
Changes
103
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
321 additions
and
344 deletions
+321
-344
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
..._instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
...on_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+0
-27
profiler/include/profiler/profile_softmax_impl.hpp
profiler/include/profiler/profile_softmax_impl.hpp
+19
-11
profiler/src/profile_softmax.cpp
profiler/src/profile_softmax.cpp
+155
-36
test/data_type/CMakeLists.txt
test/data_type/CMakeLists.txt
+3
-0
test/data_type/fp8.cpp
test/data_type/fp8.cpp
+123
-0
No files found.
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f32_f32_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
>>&
instances
)
{
add_device_softmax_f32_f32_rank3_reduce1_instances
(
instances
);
add_device_softmax_f32_f32_rank3_reduce2_instances
(
instances
);
add_device_softmax_f32_f32_rank3_reduce3_instances
(
instances
);
}
void
add_device_softmax_f32_f32_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
>>&
instances
)
{
add_device_softmax_f32_f32_rank4_reduce1_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce2_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce3_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce4_instances
(
instances
);
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce1_instances
(
void
add_device_softmax_f32_f32_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce2_instances
(
void
add_device_softmax_f32_f32_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce3_instances
(
void
add_device_softmax_f32_f32_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce1_instances
(
void
add_device_softmax_f32_f32_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce2_instances
(
void
add_device_softmax_f32_f32_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce3_instances
(
void
add_device_softmax_f32_f32_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce4_instances
(
void
add_device_softmax_f32_f32_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
4
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
4
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_i8_i8_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
3
>>&
instances
)
{
add_device_softmax_i8_i8_rank3_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce3_instances
(
instances
);
}
void
add_device_softmax_i8_i8_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
4
>>&
instances
)
{
add_device_softmax_i8_i8_rank4_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce3_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce4_instances
(
instances
);
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
4
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
profiler/include/profiler/profile_softmax_impl.hpp
View file @
72e0c1c5
...
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>() { return "int8"; }
...
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>() { return "int8"; }
template
<
>
std
::
string
type_to_string
<
int32_t
>
()
{
return
"int32"
;
}
template
<
>
std
::
string
type_to_string
<
int32_t
>
()
{
return
"int32"
;
}
// clang-format on
// clang-format on
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
>
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
,
index_t
NumReduceDim
>
bool
profile_softmax_impl
(
int
do_verification
,
bool
profile_softmax_impl
(
int
do_verification
,
int
init_method
,
int
init_method
,
bool
do_log
,
bool
do_log
,
...
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
...
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
if
(
Rank
!=
in_length
.
size
())
if
(
Rank
!=
in_length
.
size
())
{
{
throw
std
::
runtime_error
(
"Input tensor rank is different from template argument Rank!"
);
throw
std
::
runtime_error
(
"Input tensor rank is different from template argument Rank!"
);
}
};
if
(
NumReduceDim
!=
reduce_dims
.
size
())
{
throw
std
::
runtime_error
(
"Input reduce_dims rank is different from template argument NumReduceDim!"
);
};
Tensor
<
InDataType
>
in
=
in_strides
.
empty
()
?
Tensor
<
InDataType
>
(
in_length
)
Tensor
<
InDataType
>
in
=
in_strides
.
empty
()
?
Tensor
<
InDataType
>
(
in_length
)
:
Tensor
<
InDataType
>
(
in_length
,
in_strides
);
:
Tensor
<
InDataType
>
(
in_length
,
in_strides
);
...
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
...
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
// add device softmax instances
// add device softmax instances
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceOp
=
tensor_operation
::
device
::
using
DeviceOp
=
tensor_operation
::
device
::
DeviceSoftmax
<
InDataType
,
DeviceSoftmax
<
InDataType
,
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
>
;
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
,
NumReduceDim
>
;
// get device op instances
// get device op instances
const
auto
instances
=
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
const
auto
instances
=
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
...
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
...
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
for
(
auto
&
inst_ptr
:
instances
)
for
(
auto
&
inst_ptr
:
instances
)
{
{
// Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
// problem to rank 4 kernel) other than invoking IsSupportedArgument()?
if
(
!
(
inst_ptr
->
GetNumReduceDim
()
==
static_cast
<
index_t
>
(
reduce_dims
.
size
())))
{
continue
;
}
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
in_tensor_lengths
,
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
in_tensor_lengths
,
in_tensor_strides
,
in_tensor_strides
,
reduce_dims
,
reduce_dims
,
...
...
profiler/src/profile_softmax.cpp
View file @
72e0c1c5
...
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
...
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
{
{
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
,
1
>
(
do_log
,
do_verification
,
time_kernel
,
init_method
,
length
,
do_log
,
stride
,
time_kernel
,
reduce
,
length
,
double
(
alpha
),
stride
,
double
(
beta
));
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
,
1
>
(
do_verification
,
do_log
,
init_method
,
time_kernel
,
do_log
,
length
,
time_kernel
,
stride
,
length
,
reduce
,
stride
,
double
(
alpha
),
reduce
,
double
(
beta
));
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
else
{
{
...
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
...
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
{
{
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
1
>
(
do_log
,
do_verification
,
time_kernel
,
init_method
,
length
,
do_log
,
stride
,
time_kernel
,
reduce
,
length
,
double
(
alpha
),
stride
,
double
(
beta
));
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
4
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
4
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
1
>
(
do_verification
,
do_log
,
init_method
,
time_kernel
,
do_log
,
length
,
time_kernel
,
stride
,
length
,
reduce
,
stride
,
double
(
alpha
),
reduce
,
double
(
beta
));
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
4
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
4
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
else
{
{
...
...
test/data_type/CMakeLists.txt
View file @
72e0c1c5
...
@@ -2,3 +2,6 @@ if (USE_BITINT_EXTENSION_INT4)
...
@@ -2,3 +2,6 @@ if (USE_BITINT_EXTENSION_INT4)
add_gtest_executable
(
test_int4 int4.cpp
)
add_gtest_executable
(
test_int4 int4.cpp
)
target_link_libraries
(
test_int4 PRIVATE utility
)
target_link_libraries
(
test_int4 PRIVATE utility
)
endif
()
endif
()
add_gtest_executable
(
test_fp8 fp8.cpp
)
target_link_libraries
(
test_fp8 PRIVATE utility
)
test/data_type/fp8.cpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using
ck
::
f8_convert_sr
;
using
ck
::
f8_t
;
using
ck
::
half_t
;
using
ck
::
type_convert
;
TEST
(
FP8
,
NumericLimits
)
{
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
Min
(),
0x08
);
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
Max
(),
0x77
);
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
Lowest
(),
0xF7
);
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
QuietNaN
(),
0x80
);
}
// Round-trip float -> f8_t -> float with nearest-rounding conversion and
// check the result stays within an absolute tolerance of the expected value.
TEST(FP8, ConvertFP32Nearest)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-6;

    // helper: convert to f8_t (nearest rounding) and back to float
    auto round_trip = [](float value) {
        return type_convert<float>(type_convert<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(0.0f, round_trip(0.0f), tolerance);
    // the minimal positive float must round-trip to within tolerance
    ASSERT_NEAR(std::numeric_limits<float>::min(),
                round_trip(std::numeric_limits<float>::min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(240.0f, round_trip(240.0f), tolerance);
    // the maximal float must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(240.0f, round_trip(std::numeric_limits<float>::max()), tolerance);
    // +inf must convert to the quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, type_convert<f8_t>(std::numeric_limits<float>::infinity()), tolerance);

    // a representable positive value must survive the round trip
    const float positive_value = 0.0078125f;
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const float negative_value = -0.0156250f;
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
// Round-trip float -> f8_t -> float using the stochastic-rounding converter
// (f8_convert_sr) and check the result against the expected value.
TEST(FP8, ConvertFP32Stochastic)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-6;

    // helper: convert to f8_t with stochastic rounding and back to float
    auto round_trip = [](float value) {
        return type_convert<float>(f8_convert_sr<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(0.0f, round_trip(0.0f), tolerance);
    // the minimal positive float must round-trip to within tolerance
    ASSERT_NEAR(std::numeric_limits<float>::min(),
                round_trip(std::numeric_limits<float>::min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(240.0f, round_trip(240.0f), tolerance);
    // the maximal float must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(240.0f, round_trip(std::numeric_limits<float>::max()), tolerance);
    // +inf must convert to the quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(std::numeric_limits<float>::infinity()), tolerance);

    // a representable positive value must survive the round trip
    const float positive_value = 0.0078125f;
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const float negative_value = -0.0156250f;
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
// Round-trip half_t -> f8_t -> half_t with nearest-rounding conversion and
// check the result stays within an absolute tolerance of the expected value.
TEST(FP8, ConvertFP16Nearest)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-3;

    // helper: convert to f8_t (nearest rounding) and back to half_t
    auto round_trip = [](half_t value) {
        return type_convert<half_t>(type_convert<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(half_t{0.0}, round_trip(half_t{0.0}), tolerance);
    // the minimal positive half_t must round-trip to within tolerance
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
                round_trip(ck::NumericLimits<half_t>::Min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(half_t{240.0}, round_trip(half_t{240.0}), tolerance);
    // the maximal half_t must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(half_t{240.0}, round_trip(ck::NumericLimits<half_t>::Max()), tolerance);
    // a quiet-NaN half_t must convert to the f8 quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, type_convert<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), tolerance);

    // a representable positive value must survive the round trip
    const half_t positive_value = half_t{0.0078125};
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const half_t negative_value = half_t{-0.0156250};
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
// Round-trip half_t -> f8_t -> half_t using the stochastic-rounding converter
// (f8_convert_sr) and check the result against the expected value.
TEST(FP8, ConvertFP16Stochastic)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-3;

    // helper: convert to f8_t with stochastic rounding and back to half_t
    auto round_trip = [](half_t value) {
        return type_convert<half_t>(f8_convert_sr<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(half_t{0.0}, round_trip(half_t{0.0}), tolerance);
    // the minimal positive half_t must round-trip to within tolerance
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
                round_trip(ck::NumericLimits<half_t>::Min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(half_t{240.0}, round_trip(half_t{240.0}), tolerance);
    // the maximal half_t must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(half_t{240.0}, round_trip(ck::NumericLimits<half_t>::Max()), tolerance);
    // a quiet-NaN half_t must convert to the f8 quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), tolerance);

    // a representable positive value must survive the round trip
    const half_t positive_value = half_t{0.0078125};
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const half_t negative_value = half_t{-0.0156250};
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment