Commit dc0bae32 authored by Adam Osewski

Merge branch 'develop' into aosewski/wavelet_omniperf

parents 68474822 ba40c2ce
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstring>
+#include <cstdlib>
 #include <iostream>

-int profile_gemm(int, char*[]);
-int profile_gemm_splitk(int, char*[]);
-int profile_gemm_bilinear(int, char*[]);
-int profile_gemm_add_add_fastgelu(int, char*[]);
-int profile_gemm_reduce(int, char*[]);
-int profile_gemm_bias_add_reduce(int, char*[]);
-int profile_batched_gemm(int, char*[]);
-int profile_batched_gemm_gemm(int, char*[]);
-int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
-int profile_batched_gemm_reduce(int, char*[]);
-int profile_grouped_gemm(int, char*[]);
-int profile_conv_fwd(int, char*[]);
-int profile_conv_fwd_bias_relu(int, char*[]);
-int profile_conv_fwd_bias_relu_add(int, char*[]);
-int profile_conv_bwd_data(int, char*[]);
-int profile_grouped_conv_fwd(int, char*[]);
-int profile_grouped_conv_bwd_weight(int, char*[]);
-int profile_softmax(int, char*[]);
-int profile_layernorm(int, char*[]);
-int profile_groupnorm(int, char*[]);
-int profile_reduce(int, char*[]);
+#include "profiler_operation_registry.hpp"

 static void print_helper_message()
 {
-    // clang-format off
-    printf("arg1: tensor operation (gemm: GEMM\n"
-           "                        gemm_splitk: Split-K GEMM\n"
-           "                        gemm_bilinear: GEMM+Bilinear\n"
-           "                        gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU\n"
-           "                        gemm_reduce: GEMM+Reduce\n"
-           "                        gemm_bias_add_reduce: GEMM+Bias+Add+Reduce\n"
-           "                        batched_gemm: Batched GEMM\n"
-           "                        batched_gemm_gemm: Batched+GEMM+GEMM\n"
-           "                        batched_gemm_add_relu_gemm_add: Batched+GEMM+bias+gelu+GEMM+bias\n"
-           "                        batched_gemm_reduce: Batched GEMM+Reduce\n"
-           "                        grouped_gemm: Grouped GEMM\n"
-           "                        conv_fwd: Convolution Forward\n"
-           "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
-           "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
-           "                        conv_bwd_data: Convolution Backward Data\n"
-           "                        grouped_conv_fwd: Grouped Convolution Forward\n"
-           "                        grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
-           "                        softmax: Softmax\n"
-           "                        reduce: Reduce\n");
-    // clang-format on
+    std::cout << "arg1: tensor operation " << ProfilerOperationRegistry::GetInstance() << std::endl;
 }

 int main(int argc, char* argv[])
@@ -55,97 +16,15 @@ int main(int argc, char* argv[])
     if(argc == 1)
     {
         print_helper_message();
-        return 0;
-    }
-    else if(strcmp(argv[1], "gemm") == 0)
-    {
-        return profile_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_splitk") == 0)
-    {
-        return profile_gemm_splitk(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_bilinear") == 0)
-    {
-        return profile_gemm_bilinear(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
-    {
-        return profile_gemm_add_add_fastgelu(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_reduce") == 0)
-    {
-        return profile_gemm_reduce(argc, argv);
     }
-    else if(strcmp(argv[1], "gemm_bias_add_reduce") == 0)
+    else if(const auto operation = ProfilerOperationRegistry::GetInstance().Get(argv[1]);
+            operation.has_value())
     {
-        return profile_gemm_bias_add_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm") == 0)
-    {
-        return profile_batched_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_gemm") == 0)
-    {
-        return profile_batched_gemm_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_add_relu_gemm_add") == 0)
-    {
-        return profile_batched_gemm_add_relu_gemm_add(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_reduce") == 0)
-    {
-        return profile_batched_gemm_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "grouped_gemm") == 0)
-    {
-        return profile_grouped_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd") == 0)
-    {
-        return profile_conv_fwd(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
-    {
-        return profile_conv_fwd_bias_relu(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0)
-    {
-        return profile_conv_fwd_bias_relu_add(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_bwd_data") == 0)
-    {
-        return profile_conv_bwd_data(argc, argv);
-    }
-    else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
-    {
-        return profile_grouped_conv_fwd(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
-    {
-        return profile_grouped_conv_bwd_weight(argc, argv);
-    }
-    else if(strcmp(argv[1], "reduce") == 0)
-    {
-        return profile_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "softmax") == 0)
-    {
-        return profile_softmax(argc, argv);
-    }
-    else if(strcmp(argv[1], "layernorm") == 0)
-    {
-        return profile_layernorm(argc, argv);
-    }
-    else if(strcmp(argv[1], "groupnorm") == 0)
-    {
-        return profile_groupnorm(argc, argv);
+        return (*operation)(argc, argv);
     }
     else
     {
-        print_helper_message();
-        return 0;
+        std::cerr << "cannot find operation: " << argv[1] << std::endl;
+        return EXIT_FAILURE;
     }
 }
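The hunk above replaces the hand-written strcmp dispatch chain with a registry lookup. For readers unfamiliar with the idiom, here is a minimal, self-contained sketch of the same C++17 pattern (if-statement with initializer over a std::optional); all names in it are illustrative, not taken from the commit:

// Toy version of the registry-lookup dispatch used in the new main().
#include <cstdlib>
#include <functional>
#include <iostream>
#include <map>
#include <optional>
#include <string_view>

using Operation = std::function<int(int, char*[])>;

// Stand-in for ProfilerOperationRegistry::Get(): returns the operation
// registered under 'name', or std::nullopt when nothing matches.
std::optional<Operation> find_operation(std::string_view name)
{
    static const std::map<std::string_view, Operation> ops = {
        {"noop", [](int, char**) { return 0; }},
    };
    if(const auto it = ops.find(name); it != ops.end())
        return it->second;
    return std::nullopt;
}

int main(int argc, char* argv[])
{
    if(argc == 1)
        return 0;
    // C++17: declare the lookup result and test it in one statement.
    if(const auto operation = find_operation(argv[1]); operation.has_value())
        return (*operation)(argc, argv);
    std::cerr << "cannot find operation: " << argv[1] << std::endl;
    return EXIT_FAILURE;
}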
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <optional>
#include <string_view>
#include <utility>

class ProfilerOperationRegistry final
{
    ProfilerOperationRegistry() = default;
    ~ProfilerOperationRegistry() = default;

    public:
    using Operation = std::function<int(int, char*[])>;

    private:
    struct Entry final
    {
        explicit Entry(std::string_view description, Operation operation) noexcept
            : description_(description), operation_(std::move(operation))
        {
        }

        std::string_view description_;
        Operation operation_;
    };

    std::map<std::string_view, Entry> entries_;

    friend std::ostream& operator<<(std::ostream& stream, const ProfilerOperationRegistry& registry)
    {
        stream << "{\n";
        for(auto& [name, entry] : registry.entries_)
        {
            stream << "\t" << name << ": " << entry.description_ << "\n";
        }
        stream << "}";
        return stream;
    }

    public:
    static ProfilerOperationRegistry& GetInstance()
    {
        static ProfilerOperationRegistry registry;
        return registry;
    }

    std::optional<Operation> Get(std::string_view name) const
    {
        const auto found = entries_.find(name);
        if(found == end(entries_))
        {
            return std::nullopt;
        }
        return (found->second).operation_;
    }

    bool Add(std::string_view name, std::string_view description, Operation operation)
    {
        return entries_
            .emplace(std::piecewise_construct,
                     std::forward_as_tuple(name),
                     std::forward_as_tuple(description, std::move(operation)))
            .second;
    }
};

#define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y)
#define PP_CONCAT_IMPL(x, y) x##y

#define REGISTER_PROFILER_OPERATION(name, description, operation)              \
    static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \
        ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation)
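Each profiler source file can then register itself at static-initialization time, which is what removes the need for the forward declarations and strcmp chain in main(). A minimal sketch of a registration site, assuming a hypothetical profile_gemm.cpp (the names below are illustrative, not the commit's actual registration code):

#include "profiler_operation_registry.hpp"

// Hypothetical operation entry point; real profiler sources would wrap
// their existing profile_* functions like this.
static int profile_gemm(int argc, char* argv[])
{
    (void)argc;
    (void)argv;
    // ... parse argv and run the GEMM profiling loop ...
    return 0;
}

// Expands to a namespace-scope bool whose initializer inserts the entry
// into the singleton registry before main() runs; __COUNTER__ keeps the
// variable name unique within the translation unit.
REGISTER_PROFILER_OPERATION("gemm", "GEMM", profile_gemm);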
 include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/
-    ${PROJECT_SOURCE_DIR}/profiler/include
 )

 include(googletest)
@@ -53,3 +54,7 @@ add_subdirectory(softmax)
 add_subdirectory(normalization)
 add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
+add_subdirectory(batchnorm)
+if(GPU_TARGETS MATCHES "gfx1100")
+    add_subdirectory(wmma_op)
+endif()
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {

 using ADataType = ck::bhalf_t;
...
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {

 using ADataType = ck::half_t;
...
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {

 using ADataType = float;
...
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {

 using ADataType = int8_t;
...
@@ -6,7 +6,7 @@

 #include <vector>

 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_gemm_impl.hpp"

 using ck::tensor_operation::device::GemmSpecialization;
...
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler/profile_batched_gemm_reduce_impl.hpp"

 int main()
 {
...
@@ -6,7 +6,7 @@

 #include <vector>

 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"

 using ck::tensor_operation::device::GemmSpecialization;

 template <ck::index_t N>
...
@@ -27,7 +27,7 @@ using KernelTypes = ::testing::Types<
 TYPED_TEST_SUITE(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, KernelTypes);

-TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16) { this->Run(); }
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Test_BF16) { this->Run(); }
 TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadM)
 {
...
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"

 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;
...
add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp)
add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp)
add_gtest_executable(test_batchnorm_infer_rank_4 batchnorm_infer_rank_4.cpp)
target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance)
target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance)
target_link_libraries(test_batchnorm_infer_rank_4 PRIVATE utility device_batchnorm_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <limits>
#include <vector>
#include <tuple>

#include <gtest/gtest.h>
#include "profiler/profile_batchnorm_backward_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using F64 = double;
template <typename Tuple>
class TestBatchNormBwdRank4 : public ::testing::Test
{
private:
const double epsilon = std::numeric_limits<float>::epsilon();
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using DxDataType = std::tuple_element_t<1, Tuple>;
using DyDataType = std::tuple_element_t<2, Tuple>;
using AccDataType = std::tuple_element_t<3, Tuple>;
using ScaleDataType = std::tuple_element_t<4, Tuple>;
using BiasDataType = std::tuple_element_t<5, Tuple>;
using MeanVarDataType = std::tuple_element_t<6, Tuple>;
std::vector<std::vector<size_t>> list_of_lengths = {
{128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
std::vector<int> reduceDims;
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
{
bool pass = true;
EXPECT_EQ(reduceDims.size(), static_cast<std::size_t>(NumReduceDim));
pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, true, epsilon);
pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, false, epsilon);
EXPECT_TRUE(pass);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F32, F32, F32, F16, F32, F32>,
std::tuple<F32, F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>,
std::tuple<F64, F64, F64, F64, F64, F64, F64>>;
TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes);
// nhwc
TYPED_TEST(TestBatchNormBwdRank4, nhwc)
{
this->reduceDims = {0, 1, 2};
this->template Run<3>();
}
// nchw
TYPED_TEST(TestBatchNormBwdRank4, nchw)
{
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <limits>
#include <vector>
#include <tuple>

#include <gtest/gtest.h>
#include "profiler/profile_batchnorm_forward_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F64 = double;
template <typename Tuple>
class TestBatchNormFwdRank4 : public ::testing::Test
{
private:
const double epsilon = std::numeric_limits<float>::epsilon();
const double averageFactor = 0.1;
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using YDataType = std::tuple_element_t<1, Tuple>;
using AccDataType = std::tuple_element_t<2, Tuple>;
using ScaleDataType = std::tuple_element_t<3, Tuple>;
using BiasDataType = std::tuple_element_t<4, Tuple>;
using MeanVarDataType = std::tuple_element_t<5, Tuple>;
std::vector<std::vector<size_t>> list_of_lengths = {
{128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
std::vector<int> reduceDims;
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
{
bool pass = true;
EXPECT_EQ(reduceDims.size(), static_cast<std::size_t>(NumReduceDim));
pass =
pass && ck::profiler::profile_batchnorm_forward_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(true,
3,
false,
false,
inOutLengths,
reduceDims,
true,
true,
epsilon,
averageFactor);
pass =
pass && ck::profiler::profile_batchnorm_forward_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(true,
3,
false,
false,
inOutLengths,
reduceDims,
false,
false,
epsilon,
averageFactor);
EXPECT_TRUE(pass);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
std::tuple<F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
std::tuple<F64, F64, F64, F64, F64, F64>>;
TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);
// nhwc
TYPED_TEST(TestBatchNormFwdRank4, nhwc)
{
this->reduceDims = {0, 1, 2};
this->template Run<3>();
}
// nchw
TYPED_TEST(TestBatchNormFwdRank4, nchw)
{
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <limits>
#include <vector>
#include <tuple>

#include <gtest/gtest.h>
#include "profiler/profile_batchnorm_infer_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using F64 = double;
template <typename Tuple>
class TestBatchNormInferRank4 : public ::testing::Test
{
private:
const double epsilon = std::numeric_limits<float>::epsilon();
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using YDataType = std::tuple_element_t<1, Tuple>;
using AccDataType = std::tuple_element_t<2, Tuple>;
using ScaleDataType = std::tuple_element_t<3, Tuple>;
using BiasDataType = std::tuple_element_t<4, Tuple>;
using MeanVarDataType = std::tuple_element_t<5, Tuple>;
std::vector<std::vector<size_t>> list_of_lengths = {
{128, 16, 3, 1024}, {128, 16, 6, 512}, {4, 4, 4, 4}, {32, 32, 32, 32}};
std::vector<int> reduceDims;
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
{
bool pass = true;
EXPECT_EQ(reduceDims.size(), static_cast<std::size_t>(NumReduceDim));
pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, epsilon);
pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, epsilon);
EXPECT_TRUE(pass);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
std::tuple<F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
std::tuple<F64, F64, F64, F64, F64, F64>>;
TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes);
// nhwc
TYPED_TEST(TestBatchNormInferRank4, nhwc)
{
this->reduceDims = {0, 1, 2};
this->template Run<3>();
}
// nchw
TYPED_TEST(TestBatchNormInferRank4, nchw)
{
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
@@ -8,7 +8,7 @@
 #include <tuple>

 #include <gtest/gtest.h>

-#include "profiler/include/profile_conv_bwd_data_impl.hpp"
+#include "profiler/profile_conv_bwd_data_impl.hpp"

 template <typename Tuple>
 class TestConvndBwdData : public ::testing::Test
...
@@ -8,7 +8,7 @@
 #include <tuple>

 #include <gtest/gtest.h>

-#include "profiler/include/profile_conv_fwd_impl.hpp"
+#include "profiler/profile_conv_fwd_impl.hpp"

 template <typename Tuple>
 class TestConvndFwd : public ::testing::Test
...
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
+#include "profiler/profile_elementwise_layernorm_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_gemm_reduce_impl.hpp"
+#include "profiler/profile_gemm_reduce_impl.hpp"

 int main()
 {
...