Merge branch 'develop' into gemm_layernorm_welford

24af0144 · Po Yen Chen · GitHub · 961f5e9e · b79bbbc2 · 24af0144
Unverified Commit 24af0144 authored Nov 12, 2022 by Po Yen Chen Committed by GitHub Nov 12, 2022
13 changed files
--- a/test/normalization/test_layernorm2d_fp16.cpp
+++ b/test/normalization/test_layernorm2d_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "gtest/gtest.h"
+#include "profiler/include/profile_layernorm_impl.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using ck::index_t;
+template <typename Tuple>
+class TestLayernorm2d : public ::testing::Test
+{
+    protected:
+    using XDataType     = std::tuple_element_t<0, Tuple>;
+    using GammaDataType = std::tuple_element_t<1, Tuple>;
+    using BetaDataType  = std::tuple_element_t<2, Tuple>;
+    using AccDataType   = std::tuple_element_t<3, Tuple>;
+    using YDataType     = std::tuple_element_t<4, Tuple>;
+    void Run()
+    {
+        // [N, D], reduce D
+        std::vector<std::vector<ck::index_t>> lengths = {
+            {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
+        for(auto length : lengths)
+        {
+            bool success = ck::profiler::profile_layernorm_impl<XDataType,
+                                                                GammaDataType,
+                                                                BetaDataType,
+                                                                AccDataType,
+                                                                YDataType,
+                                                                2>(true, 2, false, false, length);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+using KernelTypes = ::testing::Types<
+    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
+    std::tuple<F16, F16, F16, F32, F16>>;
+TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
+TYPED_TEST(TestLayernorm2d, Test_FP16) { this->Run(); }
--- a/test/normalization/test_layernorm2d_fp32.cpp
+++ b/test/normalization/test_layernorm2d_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "gtest/gtest.h"
+#include "profiler/include/profile_layernorm_impl.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using ck::index_t;
+template <typename Tuple>
+class TestLayernorm2d : public ::testing::Test
+{
+    protected:
+    using XDataType     = std::tuple_element_t<0, Tuple>;
+    using GammaDataType = std::tuple_element_t<1, Tuple>;
+    using BetaDataType  = std::tuple_element_t<2, Tuple>;
+    using AccDataType   = std::tuple_element_t<3, Tuple>;
+    using YDataType     = std::tuple_element_t<4, Tuple>;
+    void Run()
+    {
+        // [N, D], reduce D
+        std::vector<std::vector<ck::index_t>> lengths = {
+            {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
+        for(auto length : lengths)
+        {
+            bool success = ck::profiler::profile_layernorm_impl<XDataType,
+                                                                GammaDataType,
+                                                                BetaDataType,
+                                                                AccDataType,
+                                                                YDataType,
+                                                                2>(true, 2, false, false, length);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+using KernelTypes = ::testing::Types<
+    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
+    std::tuple<F32, F32, F32, F32, F32>>;
+TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
+TYPED_TEST(TestLayernorm2d, Test_FP32) { this->Run(); }
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -12,6 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -54,7 +55,7 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParam& conv_param,
    fill_input_op(input.begin(), input.end());
    fill_weights_op(weights.begin(), weights.end());
-    std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
+    ck::ranges::fill<OutDataType>(host_output, 0.f);
    auto ref_conv     = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                 InDataType,
@@ -122,7 +123,7 @@ TEST(ReferenceConvolutionFWD, Conv2DGNHWC)
                                508.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }
 TEST(ReferenceConvolutionFWD, Conv2DGNHWCStridesDilationsPadding)
@@ -149,7 +150,7 @@ TEST(ReferenceConvolutionFWD, Conv2DGNHWCStridesDilationsPadding)
        1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }
 TEST(ReferenceConvolutionFWD, Conv1DGNWC)
@@ -178,7 +179,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWC)
    std::vector<float> ref_data{7.5, 13.5, 19.5, 25.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }
 TEST(ReferenceConvolutionFWD, Conv1DGNWCStridesDilationsPadding)
@@ -207,7 +208,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWCStridesDilationsPadding)
    std::vector<float> ref_data{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!"));
 }
 TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize)
@@ -301,7 +302,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize)
        49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4};
    EXPECT_TRUE(ck::utils::check_err(
        out_tensor2.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    EXPECT_TRUE(ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor2, ref_data, "Error: incorrect results!"));
 }
 #endif
@@ -340,8 +341,7 @@ TEST(ReferenceConvolutionFWD, Conv3DGNCDHW)
    EXPECT_TRUE(ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                     ref_dims,
                                     "Error [case 1]: wrong output tensor dimensions!"));
-    EXPECT_TRUE(
+    EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error [case 1]: incorrect results!"));
-        ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!"));
 }
 TEST(ReferenceConvolutionFWD, Conv3DGNCDHWStridesDilations)
@@ -388,5 +388,5 @@ TEST(ReferenceConvolutionFWD, Conv3DGNCDHWStridesDilations)
                                     ref_dims,
                                     "Error [case 2]: wrong output tensor dimensions!"));
    EXPECT_TRUE(ck::utils::check_err(
-        out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f));
+        out_tensor, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f));
 }
--- a/test/softmax/CMakeLists.txt
+++ b/test/softmax/CMakeLists.txt
 add_custom_target(test_softmax)
-add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp)
+add_gtest_executable(test_softmax_rank3 test_softmax_rank3.cpp)
-add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp)
+add_gtest_executable(test_softmax_rank4 test_softmax_rank4.cpp)
-add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp)
+add_gtest_executable(test_softmax_interface test_softmax_interface.cpp)
-target_link_libraries(test_softmax_fp32 PRIVATE utility)
+target_link_libraries(test_softmax_rank3 PRIVATE utility device_softmax_instance)
-target_link_libraries(test_softmax_fp16 PRIVATE utility)
+target_link_libraries(test_softmax_rank4 PRIVATE utility device_softmax_instance)
-target_link_libraries(test_softmax_int8 PRIVATE utility)
+target_link_libraries(test_softmax_interface PRIVATE utility device_softmax_instance)
-add_dependencies(test_softmax test_softmax_fp32)
+add_dependencies(test_softmax test_softmax_rank3)
-add_dependencies(test_softmax test_softmax_fp16)
+add_dependencies(test_softmax test_softmax_rank4)
-add_dependencies(test_softmax test_softmax_int8)
+add_dependencies(test_softmax test_softmax_interface)
--- a/test/softmax/test_softmax_fp16.cpp
+++ b/test/softmax/test_softmax_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "test_softmax_util.hpp"
-template <ck::index_t N>
-using I = ck::Number<N>;
-template <typename Tuple>
-class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
-{
-};
-// clang-format off
-using KernelTypes = ::testing::Types<
-// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    std::tuple<ck::half_t, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>
-    >;
-// clang-format on
-TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes);
-TYPED_TEST(TestSoftmaxFP16, Test_FP16) { this->Run(); }
--- a/test/softmax/test_softmax_fp32.cpp
+++ b/test/softmax/test_softmax_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "test_softmax_util.hpp"
-template <ck::index_t N>
-using I = ck::Number<N>;
-template <typename Tuple>
-class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
-{
-};
-// clang-format off
-using KernelTypes = ::testing::Types<
-// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
-    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
-    >;
-// clang-format on
-TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);
-TYPED_TEST(TestSoftmaxFP32, Test_FP32) { this->Run(); }
--- a/test/softmax/test_softmax_int8.cpp
+++ b/test/softmax/test_softmax_int8.cpp
-#include "gtest/gtest.h"
-#include "test_softmax_util.hpp"
-template <ck::index_t N>
-using I = ck::Number<N>;
-template <typename Tuple>
-class TestSoftmaxINT8 : public ck::TestSoftmax<Tuple>
-{
-};
-// clang-format off
-using KernelTypes = ::testing::Types<
-// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
-    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>
-    >;
-// clang-format on
-TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes);
-TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); }
--- a/test/softmax/test_softmax_interface.cpp
+++ b/test/softmax/test_softmax_interface.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <stdexcept>
+#include <vector>
+#include "gtest/gtest.h"
+#include "test_softmax_util.hpp"
+class TestSoftmaxInterface : public ::testing::Test
+{
+    protected:
+    template <ck::index_t Rank, ck::index_t NumReduceDims>
+    using SoftmaxInstance =
+        ck::DeviceSoftmaxInstanceWrapper<Rank, NumReduceDims, 256, 1, 256, 1, 8, 1, 8, 8>;
+};
+TEST_F(TestSoftmaxInterface, IncorrectReduceDims)
+{
+    std::vector<ck::index_t> lengths{2, 128, 1536};
+    std::vector<ck::index_t> strides{128 * 1536, 1536, 1};
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {-1})), std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {3})), std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {0, 1})),
+                 std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {})), std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2, -1})),
+                 std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2, 4})),
+                 std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2})), std::runtime_error);
+}
+TEST_F(TestSoftmaxInterface, IncorrectLengthsSize)
+{
+    std::vector<ck::index_t> lengths{128, 1536};
+    std::vector<ck::index_t> strides{128 * 1536, 1536, 1};
+    std::vector<ck::index_t> reduce_dims{2};
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({128, 1536}, strides, reduce_dims)),
+                 std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({}, strides, reduce_dims)),
+                 std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({1, 8, 128, 1536}, strides, reduce_dims)),
+                 std::runtime_error);
+}
+TEST_F(TestSoftmaxInterface, IncorrectStridesSize)
+{
+    std::vector<ck::index_t> lengths{2, 128, 1536};
+    std::vector<ck::index_t> reduce_dims{2};
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {1536, 1}, reduce_dims)),
+                 std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {}, reduce_dims)),
+                 std::runtime_error);
+    EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {1, 2, 3, 4}, reduce_dims)),
+                 std::runtime_error);
+}
+TEST_F(TestSoftmaxInterface, UnsupportedLengths)
+{
+    using SoftmaxInstance1 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 1, 256, 1, 8, 1, 8, 4>;
+    EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1500}, {128 * 1500, 1500, 1}, {2}));
+    EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 127, 1536}, {127 * 1536, 1536, 1}, {2}));
+    EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1537}, {128 * 1537, 1537, 1}, {2}));
+    // Reduction of middle dimensions
+    using SoftmaxInstance2 = ck::DeviceSoftmaxInstanceWrapper<3, 3, 256, 8, 32, 8, 8, 0, 8, 4>;
+    EXPECT_FALSE(SoftmaxInstance2{}.IsSupported({2, 128, 1536}, {128 * 1536, 1536, 1}, {0, 1, 2}));
+    // Reduction of middle dimensions
+    using SoftmaxInstance3 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 8, 32, 8, 8, 0, 4, 8>;
+    EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1536}, {128 * 1536, 1536, 1}, {2}));
+    EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1537}, {128 * 1537, 1537, 1}, {1}));
+    EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1540}, {128 * 1540, 1540, 1}, {1}));
+    EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 127, 1536}, {127 * 1536, 1536, 1}, {1}));
+}
+TEST_F(TestSoftmaxInterface, UnsupportedInstance)
+{
+    // Instance with InSrcVectorDim = 1, can't reduce middle dims if in/out vec size != 1
+    using SoftmaxInstance1 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 8, 32, 1, 8, 1, 8, 8>;
+    EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1024}, {128 * 1024, 1024, 1}, {0}));
+}
--- a/test/softmax/test_softmax_rank3.cpp
+++ b/test/softmax/test_softmax_rank3.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <algorithm>
+#include <stdexcept>
+#include <vector>
+#include "gtest/gtest.h"
+#include "test_softmax_util.hpp"
+template <ck::index_t N>
+using I = ck::Number<N>;
+using F16 = ck::half_t;
+using F32 = float;
+using I8  = int8_t;
+template <typename Tuple>
+class TestSoftmax : public ck::TestSoftmax<Tuple>
+{
+};
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         InDataType, AccDataType, OutDataType, Rank
+    std::tuple<       F16,         F32,         F16,    I<3>>,
+    std::tuple<       F32,         F32,         F32,    I<3>>,
+    std::tuple<        I8,         F32,          I8,    I<3>>
+    >;
+// clang-format on
+TYPED_TEST_SUITE(TestSoftmax, KernelTypes);
+#include "test_softmax_ut_cases.inc"
--- a/test/softmax/test_softmax_rank4.cpp
+++ b/test/softmax/test_softmax_rank4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <algorithm>
+#include <stdexcept>
+#include <vector>
+#include "gtest/gtest.h"
+#include "test_softmax_util.hpp"
+template <ck::index_t N>
+using I = ck::Number<N>;
+using F16 = ck::half_t;
+using F32 = float;
+using I8  = int8_t;
+template <typename Tuple>
+class TestSoftmax : public ck::TestSoftmax<Tuple>
+{
+};
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         InDataType, AccDataType, OutDataType, Rank
+    std::tuple<       F16,         F32,         F16,    I<4>>,
+    std::tuple<       F32,         F32,         F32,    I<4>>,
+    std::tuple<        I8,         F32,          I8,    I<4>>
+    >;
+// clang-format on
+TYPED_TEST_SUITE(TestSoftmax, KernelTypes);
+#include "test_softmax_ut_cases.inc"
--- a/test/softmax/test_softmax_ut_cases.inc
+++ b/test/softmax/test_softmax_ut_cases.inc
+#pragma once
+TYPED_TEST(TestSoftmax, ReduceOutermostDim)
+{
+    std::vector<ck::index_t> reduce_dims{this->Rank - 1};
+    this->Run(reduce_dims);
+}
+TYPED_TEST(TestSoftmax, ReduceMiddleDim)
+{
+    for(int dim = 0; dim < this->Rank - 1; ++dim)
+    {
+        std::vector<ck::index_t> reduce_dims{dim};
+        this->Run(reduce_dims);
+    }
+}
+TYPED_TEST(TestSoftmax, ReduceMultipleDimsWithOutermost)
+{
+    for(int dim = 0; dim < this->Rank - 1; ++dim)
+    {
+        std::vector<ck::index_t> reduce_dims{dim, this->Rank - 1};
+        this->Run(reduce_dims);
+    }
+}
+TYPED_TEST(TestSoftmax, ReduceMultipleMiddleDims)
+{
+    std::vector<ck::index_t> reduce_dims{0, 1};
+    if(this->Rank >= 3)
+    {
+        this->Run(reduce_dims);
+    }
+    if(this->Rank >= 4)
+    {
+        reduce_dims = std::vector<ck::index_t>{0, 2};
+        this->Run(reduce_dims);
+        reduce_dims = std::vector<ck::index_t>{0, 1, 2};
+        this->Run(reduce_dims);
+    }
+}
+TYPED_TEST(TestSoftmax, ReduceAllDims)
+{
+    std::vector<ck::index_t> reduce_dims(this->Rank);
+    std::iota(std::begin(reduce_dims), std::end(reduce_dims), 0);
+    this->Run(reduce_dims);
+}
+TYPED_TEST(TestSoftmax, ReduceOddLengths)
+{
+    this->in_lengths_ = {{3, 63, 1032}};
+    if(this->Rank >= 4)
+    {
+        this->in_lengths_ = {{1, 3, 63, 1032}};
+    }
+    this->Run({this->Rank - 1});
+    this->Run({this->Rank - 2});
+}
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -3,19 +3,17 @@
 #pragma once
+#include <string>
+#include <sstream>
+#include <tuple>
 #include <vector>
-#include <iostream>
 #include <gtest/gtest.h>
 #include "ck/ck.hpp"
-#include "ck/utility/number.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "include/ck/utility/data_type.hpp"
-#include "ck/library/utility/check_err.hpp"
+#include "profiler/include/profile_softmax_impl.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 namespace ck {
@@ -35,126 +33,110 @@ template <typename Tuple>
 class TestSoftmax : public ::testing::Test
 {
    protected:
-    using InDataType                            = std::tuple_element_t<0, Tuple>;
+    using InDataType              = std::tuple_element_t<0, Tuple>;
-    using AccDataType                           = std::tuple_element_t<1, Tuple>;
+    using AccDataType             = std::tuple_element_t<1, Tuple>;
-    using OutDataType                           = std::tuple_element_t<2, Tuple>;
+    using OutDataType             = std::tuple_element_t<2, Tuple>;
-    static constexpr index_t Rank               = std::tuple_element_t<3, Tuple>{}.value;
+    static constexpr index_t Rank = std::tuple_element_t<3, Tuple>{}.value;
-    static constexpr index_t NumReduceDim       = std::tuple_element_t<4, Tuple>{}.value;
-    static constexpr index_t BlockSize          = std::tuple_element_t<5, Tuple>{}.value;
+    public:
-    static constexpr index_t MThreadClusterSize = std::tuple_element_t<6, Tuple>{}.value;
+    std::vector<std::vector<index_t>> in_lengths_ = {{2, 128, 1024}, {4, 16, 8448}, {128, 128, 64}};
-    static constexpr index_t KThreadClusterSize = std::tuple_element_t<7, Tuple>{}.value;
+    std::vector<std::vector<AccDataType>> scales_ = {{2, 0}, {0, 2}, {2, 2}};
-    static constexpr index_t MThreadSliceSize   = std::tuple_element_t<8, Tuple>{}.value;
+    bool bench_                                   = false; // measure kernel performance
-    static constexpr index_t KThreadSliceSize   = std::tuple_element_t<9, Tuple>{}.value;
+    bool verify_                                  = true;
-    static constexpr index_t InSrcVectorDim     = std::tuple_element_t<10, Tuple>{}.value;
-    static constexpr index_t InSrcVectorSize    = std::tuple_element_t<11, Tuple>{}.value;
+    void SetUp() override
-    static constexpr index_t OutDstVectorSize   = std::tuple_element_t<12, Tuple>{}.value;
-    using ReferenceInstance =
-        tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceInstance = tensor_operation::device::DeviceSoftmaxImpl<InDataType,
-                                                                       AccDataType,
-                                                                       OutDataType,
-                                                                       PassThrough,
-                                                                       PassThrough,
-                                                                       Rank,
-                                                                       NumReduceDim,
-                                                                       BlockSize,
-                                                                       MThreadClusterSize,
-                                                                       KThreadClusterSize,
-                                                                       MThreadSliceSize,
-                                                                       KThreadSliceSize,
-                                                                       InSrcVectorDim,
-                                                                       InSrcVectorSize,
-                                                                       OutDstVectorSize>;
-    TestSoftmax() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}
-    void RunSingle(std::vector<index_t> in_length, AccDataType alpha, AccDataType beta)
    {
-        std::vector<index_t> reduce_dims(NumReduceDim);
+        if constexpr(Rank == 4)
-        std::iota(reduce_dims.begin(), reduce_dims.end(), Rank - NumReduceDim);
-        Tensor<InDataType> in(in_length);
-        Tensor<OutDataType> out(in_length);
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        Tensor<OutDataType> out_ref(out);
-        DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-        DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
-        in_dev.ToDevice(in.mData.data());
-        out_dev.ToDevice(out.mData.data());
-        std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(),
-                                          in.mDesc.GetLengths().end());
-        std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(),
-                                          in.mDesc.GetStrides().end());
-        auto device_instance = DeviceInstance{};
-        auto argument_ptr    = device_instance.MakeArgumentPointer(i_in_lengths,
-                                                                i_in_strides,
-                                                                reduce_dims,
-                                                                &alpha,
-                                                                &beta,
-                                                                in_dev.GetDeviceBuffer(),
-                                                                out_dev.GetDeviceBuffer(),
-                                                                PassThrough{},
-                                                                PassThrough{});
-        if(!device_instance.IsSupportedArgument(argument_ptr.get()))
        {
-            // std::cout << "Skipped due to unsupported argument: "
+            in_lengths_ = std::vector<std::vector<index_t>>{
-            //           << "input lengths = [" << serialize_range(in_length) << "], "
+                {1, 2, 128, 1024}, {2, 4, 16, 8448}, {1, 128, 128, 64}};
-            //           << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
-            return;
        }
+    }
-        auto invoker_ptr = device_instance.MakeInvokerPointer();
+    void RunSingle(std::vector<index_t> in_length,
-        invoker_ptr->Run(argument_ptr.get());
+                   std::vector<index_t> reduce_dims,
+                   AccDataType alpha,
-        ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims});
+                   AccDataType beta)
+    {
-        out_dev.FromDevice(out.mData.data());
+        int init_method = 1; // integer value initialization
+        bool log        = false;
-        bool pass;
+        std::vector<ck::index_t> strides; // intenionally empty, to get packed layout.
+        bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
-        if(std::is_same<InDataType, int8_t>::value)
+            verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
-        {
+        EXPECT_TRUE(pass);
-            EXPECT_TRUE(pass = ck::utils::check_err(
+    }
-                            out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1));
-        }
-        else
-        {
-            EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData));
-        }
-        if(!pass)
+    void Run(std::vector<index_t> reduce_dims = {})
+    {
+        if(reduce_dims.empty())
        {
-            FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], "
+            reduce_dims.push_back(Rank - 1);
-                   << "scaler = [" << alpha << ", " << beta << "].";
        }
-    }
-    void Run()
-    {
        for(auto in_length : this->in_lengths_)
        {
            for(auto scale : this->scales_)
            {
-                this->RunSingle(in_length, scale[0], scale[1]);
+                this->RunSingle(in_length, reduce_dims, scale[0], scale[1]);
            }
        }
    }
+};
-    std::vector<std::vector<index_t>> in_lengths_ = {
+template <index_t Rank,
-        {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}};
+          index_t NumReduceDim,
-    std::vector<std::vector<AccDataType>> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}};
+          index_t BlockSize,
+          index_t MThreadClusterSize,
-    typename ReferenceInstance::Invoker ref_instance_invoker_;
+          index_t KThreadClusterSize,
+          index_t MThreadSliceSize,
+          index_t KThreadSliceSize,
+          index_t InSrcVectorDim,
+          index_t InSrcVectorSize,
+          index_t OutDstVectorSize>
+struct DeviceSoftmaxInstanceWrapper
+{
+    using F16  = half_t;
+    using F32  = float;
+    using Pass = tensor_operation::element_wise::PassThrough;
+    using InDataType   = F16;
+    using AccDataType  = F32;
+    using OutDataType  = F16;
+    using InElementOp  = Pass;
+    using AccElementOp = Pass;
+    using DeviceSoftmaxInstance = tensor_operation::device::DeviceSoftmaxImpl<InDataType,
+                                                                              AccDataType,
+                                                                              OutDataType,
+                                                                              InElementOp,
+                                                                              AccElementOp,
+                                                                              Rank,
+                                                                              NumReduceDim,
+                                                                              BlockSize,
+                                                                              MThreadClusterSize,
+                                                                              KThreadClusterSize,
+                                                                              MThreadSliceSize,
+                                                                              KThreadSliceSize,
+                                                                              InSrcVectorDim,
+                                                                              InSrcVectorSize,
+                                                                              OutDstVectorSize>;
+    bool IsSupported(const std::vector<index_t> in_lengths,
+                     const std::vector<index_t> in_strides,
+                     const std::vector<index_t> reduce_dims) const
+    {
+        auto softmax  = DeviceSoftmaxInstance{};
+        auto argument = softmax.MakeArgument(in_lengths,
+                                             in_strides,
+                                             reduce_dims,
+                                             1,       // alpha
+                                             1,       // beta
+                                             nullptr, // in_dev
+                                             nullptr, // in_out
+                                             Pass{},  // in elementwise op
+                                             Pass{}); // acc elementwise op
+        return softmax.IsSupportedArgument(argument);
+    }
 };
 } // namespace ck
--- a/test/space_filling_curve/space_filling_curve.cpp
+++ b/test/space_filling_curve/space_filling_curve.cpp
@@ -12,28 +12,91 @@
 using namespace ck;
-void traverse_using_space_filling_curve();
+void traverse_using_space_filling_curve_linear();
+void traverse_using_space_filling_curve_snakecurved();
 int main(int argc, char** argv)
 {
    (void)argc;
    (void)argv;
-    traverse_using_space_filling_curve();
+    traverse_using_space_filling_curve_linear();
+    traverse_using_space_filling_curve_snakecurved();
    return 0;
 }
-void traverse_using_space_filling_curve()
+void traverse_using_space_filling_curve_linear()
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
-    using TensorLengths     = Sequence<16, 10, 9>;
+    using TensorLengths    = Sequence<3, 2, 2>;
-    using DimAccessOrder    = Sequence<2, 0, 1>;
+    using DimAccessOrder   = Sequence<2, 0, 1>;
-    using ScalarsPerAccess  = Sequence<4, 2, 3>;
+    using ScalarsPerAccess = Sequence<1, 1, 1>;
-    using SpaceFillingCurve = SpaceFillingCurve<TensorLengths, DimAccessOrder, ScalarsPerAccess>;
+    using SpaceFillingCurve =
+        SpaceFillingCurve<TensorLengths, DimAccessOrder, ScalarsPerAccess, false>;
+    constexpr auto expected = make_tuple(make_tuple(0, 0, 0),
+                                         make_tuple(0, 1, 0),
+                                         make_tuple(1, 0, 0),
+                                         make_tuple(1, 1, 0),
+                                         make_tuple(2, 0, 0),
+                                         make_tuple(2, 1, 0),
+                                         make_tuple(0, 0, 1),
+                                         make_tuple(0, 1, 1),
+                                         make_tuple(1, 0, 1),
+                                         make_tuple(1, 1, 1),
+                                         make_tuple(2, 0, 1),
+                                         make_tuple(2, 1, 1));
+    constexpr index_t num_access = SpaceFillingCurve::GetNumOfAccess();
+    static_assert(num_access == reduce_on_sequence(TensorLengths{} / ScalarsPerAccess{},
+                                                   math::multiplies{},
+                                                   Number<1>{}));
+    static_for<1, num_access, 1>{}([&](auto i) {
+        constexpr auto idx_curr = SpaceFillingCurve::GetIndex(i);
+        static_assert(idx_curr[I0] == expected[i][I0]);
+        static_assert(idx_curr[I1] == expected[i][I1]);
+        static_assert(idx_curr[I2] == expected[i][I2]);
+        constexpr auto backward_step = SpaceFillingCurve::GetBackwardStep(i);
+        constexpr auto expected_step = expected[i - I1] - expected[i];
+        static_assert(backward_step[I0] == expected_step[I0]);
+        static_assert(backward_step[I1] == expected_step[I1]);
+        static_assert(backward_step[I2] == expected_step[I2]);
+    });
+    static_for<0, num_access - 1, 1>{}([&](auto i) {
+        constexpr auto idx_curr = SpaceFillingCurve::GetIndex(i);
+        static_assert(idx_curr[I0] == expected[i][I0]);
+        static_assert(idx_curr[I1] == expected[i][I1]);
+        static_assert(idx_curr[I2] == expected[i][I2]);
+        constexpr auto forward_step  = SpaceFillingCurve::GetForwardStep(i);
+        constexpr auto expected_step = expected[i + I1] - expected[i];
+        static_assert(forward_step[I0] == expected_step[I0]);
+        static_assert(forward_step[I1] == expected_step[I1]);
+        static_assert(forward_step[I2] == expected_step[I2]);
+    });
+}
+void traverse_using_space_filling_curve_snakecurved()
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    using TensorLengths    = Sequence<16, 10, 9>;
+    using DimAccessOrder   = Sequence<2, 0, 1>;
+    using ScalarsPerAccess = Sequence<4, 2, 3>;
+    using SpaceFillingCurve =
+        SpaceFillingCurve<TensorLengths, DimAccessOrder, ScalarsPerAccess, true>;
    constexpr auto expected = make_tuple(make_tuple(0, 0, 0),
                                         make_tuple(0, 2, 0),