Commit ab663329 authored by aska-0096

Merge develop

parents 4fec5ad3 8a4253ba
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
template <typename Tuple>
class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std::tuple<ck::half_t, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>
>;
// clang-format on
TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes);
TYPED_TEST(TestSoftmaxFP16, Test_FP16) { this->Run(); }
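// For reference, a hedged sketch (not part of the suite): each KernelTypes row
// above encodes the thirteen fields named in the legend, which line up with
// DeviceSoftmaxImpl's template parameters (cf. the DeviceInstance alias in
// test_softmax_util.hpp). The first row, for example, corresponds to an
// instance like:
//
//   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
//   using Instance    = ck::tensor_operation::device::DeviceSoftmaxImpl<
//       ck::half_t, float, float, // InDataType, AccDataType, OutDataType
//       PassThrough, PassThrough, // input/accumulator elementwise ops
//       3, 1,                     // Rank, NumReduceDim
//       256, 8, 32,               // BlockSize, MThreadClusterSize, KThreadClusterSize
//       1, 8,                     // MThreadSliceSize, KThreadSliceSize
//       1, 8, 4>;                 // InSrcVectorDim, InSrcVectorSize, OutDstVectorSize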
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
template <typename Tuple>
class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
>;
// clang-format on
TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);
TYPED_TEST(TestSoftmaxFP32, Test_FP32) { this->Run(); }
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
template <typename Tuple>
class TestSoftmaxINT8 : public ck::TestSoftmax<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>
>;
// clang-format on
TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes);
TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <stdexcept>
#include <vector>
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
class TestSoftmaxInterface : public ::testing::Test
{
protected:
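    // The trailing wrapper arguments are, in order: BlockSize, MThreadClusterSize,
    // KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim,
    // InSrcVectorSize and OutDstVectorSize (see DeviceSoftmaxInstanceWrapper in
    // test_softmax_util.hpp).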
template <ck::index_t Rank, ck::index_t NumReduceDims>
using SoftmaxInstance =
ck::DeviceSoftmaxInstanceWrapper<Rank, NumReduceDims, 256, 1, 256, 1, 8, 1, 8, 8>;
};
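// A reduce dim is valid only if it lies in [0, Rank) and exactly NumReduceDims
// dims are passed; every malformed combination below is expected to surface as
// a std::runtime_error.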
TEST_F(TestSoftmaxInterface, IncorrectReduceDims)
{
std::vector<ck::index_t> lengths{2, 128, 1536};
std::vector<ck::index_t> strides{128 * 1536, 1536, 1};
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {-1})), std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {3})), std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {0, 1})),
std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {})), std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2, -1})),
std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2, 4})),
std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2})), std::runtime_error);
}
TEST_F(TestSoftmaxInterface, IncorrectLengthsSize)
{
std::vector<ck::index_t> lengths{128, 1536};
std::vector<ck::index_t> strides{128 * 1536, 1536, 1};
std::vector<ck::index_t> reduce_dims{2};
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({128, 1536}, strides, reduce_dims)),
std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({}, strides, reduce_dims)),
std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({1, 8, 128, 1536}, strides, reduce_dims)),
std::runtime_error);
}
TEST_F(TestSoftmaxInterface, IncorrectStridesSize)
{
std::vector<ck::index_t> lengths{2, 128, 1536};
std::vector<ck::index_t> reduce_dims{2};
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {1536, 1}, reduce_dims)),
std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {}, reduce_dims)),
std::runtime_error);
EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {1, 2, 3, 4}, reduce_dims)),
std::runtime_error);
}
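// Unlike the malformed arguments above, a well-formed problem that the instance
// simply cannot handle (e.g. lengths incompatible with the configured vector
// access) should make IsSupported() return false rather than throw.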
TEST_F(TestSoftmaxInterface, UnsupportedLengths)
{
using SoftmaxInstance1 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 1, 256, 1, 8, 1, 8, 4>;
EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1500}, {128 * 1500, 1500, 1}, {2}));
EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 127, 1536}, {127 * 1536, 1536, 1}, {2}));
EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1537}, {128 * 1537, 1537, 1}, {2}));
// Reduction over all dimensions, including the middle ones
using SoftmaxInstance2 = ck::DeviceSoftmaxInstanceWrapper<3, 3, 256, 8, 32, 8, 8, 0, 8, 4>;
EXPECT_FALSE(SoftmaxInstance2{}.IsSupported({2, 128, 1536}, {128 * 1536, 1536, 1}, {0, 1, 2}));
// Reduction of middle dimensions
using SoftmaxInstance3 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 8, 32, 8, 8, 0, 4, 8>;
EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1536}, {128 * 1536, 1536, 1}, {2}));
EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1537}, {128 * 1537, 1537, 1}, {1}));
EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1540}, {128 * 1540, 1540, 1}, {1}));
EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 127, 1536}, {127 * 1536, 1536, 1}, {1}));
}
TEST_F(TestSoftmaxInterface, UnsupportedInstance)
{
// An instance with InSrcVectorDim = 1 can't reduce middle dims if in/out vector size != 1
using SoftmaxInstance1 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 8, 32, 1, 8, 1, 8, 8>;
EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1024}, {128 * 1024, 1024, 1}, {0}));
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <numeric>
#include <stdexcept>
#include <vector>
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
using F16 = ck::half_t;
using F32 = float;
using I8 = int8_t;
template <typename Tuple>
class TestSoftmax : public ck::TestSoftmax<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank
std::tuple< F16, F32, F16, I<3>>,
std::tuple< F32, F32, F32, I<3>>,
std::tuple< I8, F32, I8, I<3>>
>;
// clang-format on
TYPED_TEST_SUITE(TestSoftmax, KernelTypes);
#include "test_softmax_ut_cases.inc"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <numeric>
#include <stdexcept>
#include <vector>
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
using F16 = ck::half_t;
using F32 = float;
using I8 = int8_t;
template <typename Tuple>
class TestSoftmax : public ck::TestSoftmax<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank
std::tuple< F16, F32, F16, I<4>>,
std::tuple< F32, F32, F32, I<4>>,
std::tuple< I8, F32, I8, I<4>>
>;
// clang-format on
TYPED_TEST_SUITE(TestSoftmax, KernelTypes);
#include "test_softmax_ut_cases.inc"
#pragma once
TYPED_TEST(TestSoftmax, ReduceOutermostDim)
{
std::vector<ck::index_t> reduce_dims{this->Rank - 1};
this->Run(reduce_dims);
}
TYPED_TEST(TestSoftmax, ReduceMiddleDim)
{
for(int dim = 0; dim < this->Rank - 1; ++dim)
{
std::vector<ck::index_t> reduce_dims{dim};
this->Run(reduce_dims);
}
}
TYPED_TEST(TestSoftmax, ReduceMultipleDimsWithOutermost)
{
for(int dim = 0; dim < this->Rank - 1; ++dim)
{
std::vector<ck::index_t> reduce_dims{dim, this->Rank - 1};
this->Run(reduce_dims);
}
}
TYPED_TEST(TestSoftmax, ReduceMultipleMiddleDims)
{
std::vector<ck::index_t> reduce_dims{0, 1};
if(this->Rank >= 3)
{
this->Run(reduce_dims);
}
if(this->Rank >= 4)
{
reduce_dims = std::vector<ck::index_t>{0, 2};
this->Run(reduce_dims);
reduce_dims = std::vector<ck::index_t>{0, 1, 2};
this->Run(reduce_dims);
}
}
TYPED_TEST(TestSoftmax, ReduceAllDims)
{
std::vector<ck::index_t> reduce_dims(this->Rank);
std::iota(std::begin(reduce_dims), std::end(reduce_dims), 0);
this->Run(reduce_dims);
}
TYPED_TEST(TestSoftmax, ReduceOddLengths)
{
this->in_lengths_ = {{3, 63, 1032}};
if(this->Rank >= 4)
{
this->in_lengths_ = {{1, 3, 63, 1032}};
}
this->Run({this->Rank - 1});
this->Run({this->Rank - 2});
}
@@ -3,19 +3,17 @@
 #pragma once

 #include <string>
 #include <sstream>
 #include <tuple>
 #include <vector>
 #include <iostream>

 #include <gtest/gtest.h>

 #include "ck/ck.hpp"
 #include "ck/utility/number.hpp"

 #include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "include/ck/utility/data_type.hpp"
+#include "profiler/include/profile_softmax_impl.hpp"
namespace ck {
@@ -35,126 +33,110 @@ template <typename Tuple>
 class TestSoftmax : public ::testing::Test
 {
     protected:
-    using InDataType  = std::tuple_element_t<0, Tuple>;
-    using AccDataType = std::tuple_element_t<1, Tuple>;
-    using OutDataType = std::tuple_element_t<2, Tuple>;
-    static constexpr index_t Rank               = std::tuple_element_t<3, Tuple>{}.value;
-    static constexpr index_t NumReduceDim       = std::tuple_element_t<4, Tuple>{}.value;
-    static constexpr index_t BlockSize          = std::tuple_element_t<5, Tuple>{}.value;
-    static constexpr index_t MThreadClusterSize = std::tuple_element_t<6, Tuple>{}.value;
-    static constexpr index_t KThreadClusterSize = std::tuple_element_t<7, Tuple>{}.value;
-    static constexpr index_t MThreadSliceSize   = std::tuple_element_t<8, Tuple>{}.value;
-    static constexpr index_t KThreadSliceSize   = std::tuple_element_t<9, Tuple>{}.value;
-    static constexpr index_t InSrcVectorDim     = std::tuple_element_t<10, Tuple>{}.value;
-    static constexpr index_t InSrcVectorSize    = std::tuple_element_t<11, Tuple>{}.value;
-    static constexpr index_t OutDstVectorSize   = std::tuple_element_t<12, Tuple>{}.value;
-
-    using ReferenceInstance =
-        tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
-    using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceInstance = tensor_operation::device::DeviceSoftmaxImpl<InDataType,
-                                                                       AccDataType,
-                                                                       OutDataType,
-                                                                       PassThrough,
-                                                                       PassThrough,
-                                                                       Rank,
-                                                                       NumReduceDim,
-                                                                       BlockSize,
-                                                                       MThreadClusterSize,
-                                                                       KThreadClusterSize,
-                                                                       MThreadSliceSize,
-                                                                       KThreadSliceSize,
-                                                                       InSrcVectorDim,
-                                                                       InSrcVectorSize,
-                                                                       OutDstVectorSize>;
-
-    TestSoftmax() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}
-
-    void RunSingle(std::vector<index_t> in_length, AccDataType alpha, AccDataType beta)
-    {
-        std::vector<index_t> reduce_dims(NumReduceDim);
-        std::iota(reduce_dims.begin(), reduce_dims.end(), Rank - NumReduceDim);
-
-        Tensor<InDataType> in(in_length);
-        Tensor<OutDataType> out(in_length);
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        Tensor<OutDataType> out_ref(out);
-
-        DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-        DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
-        in_dev.ToDevice(in.mData.data());
-        out_dev.ToDevice(out.mData.data());
-
-        std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(),
-                                          in.mDesc.GetLengths().end());
-        std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(),
-                                          in.mDesc.GetStrides().end());
-
-        auto device_instance = DeviceInstance{};
-        auto argument_ptr    = device_instance.MakeArgumentPointer(i_in_lengths,
-                                                                   i_in_strides,
-                                                                   reduce_dims,
-                                                                   &alpha,
-                                                                   &beta,
-                                                                   in_dev.GetDeviceBuffer(),
-                                                                   out_dev.GetDeviceBuffer(),
-                                                                   PassThrough{},
-                                                                   PassThrough{});
-        if(!device_instance.IsSupportedArgument(argument_ptr.get()))
-        {
-            // std::cout << "Skipped due to unsupported argument: "
-            //           << "input lengths = [" << serialize_range(in_length) << "], "
-            //           << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
-            return;
-        }
-
-        auto invoker_ptr = device_instance.MakeInvokerPointer();
-        invoker_ptr->Run(argument_ptr.get());
-
-        ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims});
-
-        out_dev.FromDevice(out.mData.data());
-
-        bool pass;
-        if(std::is_same<InDataType, int8_t>::value)
-        {
-            EXPECT_TRUE(pass = ck::utils::check_err(
-                            out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1));
-        }
-        else
-        {
-            EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData));
-        }
-
-        if(!pass)
-        {
-            FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], "
-                   << "scaler = [" << alpha << ", " << beta << "].";
-        }
-    }
-
-    void Run()
-    {
+    using InDataType  = std::tuple_element_t<0, Tuple>;
+    using AccDataType = std::tuple_element_t<1, Tuple>;
+    using OutDataType = std::tuple_element_t<2, Tuple>;
+    static constexpr index_t Rank = std::tuple_element_t<3, Tuple>{}.value;
+
+    public:
+    std::vector<std::vector<index_t>> in_lengths_ = {{2, 128, 1024}, {4, 16, 8448}, {128, 128, 64}};
+    std::vector<std::vector<AccDataType>> scales_ = {{2, 0}, {0, 2}, {2, 2}};
+    bool bench_  = false; // measure kernel performance
+    bool verify_ = true;
+
+    void SetUp() override
+    {
+        if constexpr(Rank == 4)
+        {
+            in_lengths_ = std::vector<std::vector<index_t>>{
+                {1, 2, 128, 1024}, {2, 4, 16, 8448}, {1, 128, 128, 64}};
+        }
+    }
+
+    void RunSingle(std::vector<index_t> in_length,
+                   std::vector<index_t> reduce_dims,
+                   AccDataType alpha,
+                   AccDataType beta)
+    {
+        int init_method = 1; // integer value initialization
+        bool log        = false;
+        std::vector<ck::index_t> strides; // intentionally empty, to get packed layout
+        bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
+            verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
+        EXPECT_TRUE(pass);
+    }
+
+    void Run(std::vector<index_t> reduce_dims = {})
+    {
+        if(reduce_dims.empty())
+        {
+            reduce_dims.push_back(Rank - 1);
+        }
         for(auto in_length : this->in_lengths_)
         {
             for(auto scale : this->scales_)
             {
-                this->RunSingle(in_length, scale[0], scale[1]);
+                this->RunSingle(in_length, reduce_dims, scale[0], scale[1]);
             }
         }
     }
-
-    std::vector<std::vector<index_t>> in_lengths_ = {
-        {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}};
-    std::vector<std::vector<AccDataType>> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}};
-    typename ReferenceInstance::Invoker ref_instance_invoker_;
 };
template <index_t Rank,
index_t NumReduceDim,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct DeviceSoftmaxInstanceWrapper
{
using F16 = half_t;
using F32 = float;
using Pass = tensor_operation::element_wise::PassThrough;
using InDataType = F16;
using AccDataType = F32;
using OutDataType = F16;
using InElementOp = Pass;
using AccElementOp = Pass;
using DeviceSoftmaxInstance = tensor_operation::device::DeviceSoftmaxImpl<InDataType,
AccDataType,
OutDataType,
InElementOp,
AccElementOp,
Rank,
NumReduceDim,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcVectorDim,
InSrcVectorSize,
OutDstVectorSize>;
bool IsSupported(const std::vector<index_t> in_lengths,
const std::vector<index_t> in_strides,
const std::vector<index_t> reduce_dims) const
{
auto softmax = DeviceSoftmaxInstance{};
auto argument = softmax.MakeArgument(in_lengths,
in_strides,
reduce_dims,
1, // alpha
1, // beta
nullptr, // in_dev
nullptr, // out_dev
Pass{}, // in elementwise op
Pass{}); // acc elementwise op
return softmax.IsSupportedArgument(argument);
}
};
} // namespace ck
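// Usage sketch (hedged; mirrors the interface tests above): instantiate a
// wrapper and query support before dispatching.
//
//   using Wrapper = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 1, 256, 1, 8, 1, 8, 8>;
//   const bool ok = Wrapper{}.IsSupported({2, 128, 1024}, {128 * 1024, 1024, 1}, {2});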