Merge remote-tracking branch 'origin/develop' into cpu_avx2

b134b7d6 · carlushuang · 090ba885 · 9f71ff48 · b134b7d6 · b134b7d6
Commit b134b7d6 authored May 16, 2022 by carlushuang
11 changed files
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -15,7 +15,7 @@
 #include "host_gemm.hpp"
 #include "device_tensor.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
+#include "device_gemm_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"

--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -15,7 +15,7 @@
 #include "host_gemm.hpp"
 #include "device_tensor.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
+#include "device_gemm_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"

--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
-#ifndef GEMM_UTILS_HPP
-#define GEMM_UTILS_HPP
-
-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "reference_gemm.hpp"
-#include "tensor_layout.hpp"
-
-namespace ck {
-namespace gemm_util {
-
-struct GemmParams
-{
-    GemmParams()
-        : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0)
-    {
-    }
-
-    ck::index_t M;
-    ck::index_t N;
-    ck::index_t K;
-
-    ck::index_t StrideA;
-    ck::index_t StrideB;
-    ck::index_t StrideC;
-
-    float alpha;
-    float beta;
-};
-
-template <typename GemmInstance,
-          typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-void RunHostGEMM(const Tensor<ADataType>& A,
-                 const Tensor<BDataType>& B,
-                 Tensor<CDataType>& C,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-{
-    auto ref_gemm    = GemmInstance{};
-    auto ref_invoker = ref_gemm.MakeInvoker();
-
-    auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);
-
-    ref_invoker.Run(ref_argument);
-}
-
-template <typename DeviceGemmPtr_,
-          typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
-                   const ck::gemm_util::GemmParams& params,
-                   const Tensor<ADataType>& A,
-                   const Tensor<BDataType>& B,
-                   Tensor<CDataType>& C,
-                   AElementwiseOperation a_element_op,
-                   BElementwiseOperation b_element_op,
-                   CElementwiseOperation c_element_op)
-{
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
-
-    a_m_k_device_buf.ToDevice(A.mData.data());
-    b_k_n_device_buf.ToDevice(B.mData.data());
-
-    auto invoker_ptr = gemmPtr->MakeInvokerPointer();
-    auto argument_ptr =
-        gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-                                     static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-                                     static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
-                                     params.M,
-                                     params.N,
-                                     params.K,
-                                     params.StrideA,
-                                     params.StrideB,
-                                     params.StrideC,
-                                     a_element_op,
-                                     b_element_op,
-                                     c_element_op);
-
-    if(!gemmPtr->IsSupportedArgument(argument_ptr.get()))
-    {
-        throw std::runtime_error(
-            "wrong! device_gemm with the specified compilation parameters does "
-            "not support this GEMM problem");
-    }
-
-    invoker_ptr->Run(argument_ptr.get());
-    c_m_n_device_buf.FromDevice(C.mData.data());
-}
-
-template <typename DeviceGemmPtr_,
-          typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct TestGemm
-{
-    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
-    {
-        auto f_host_tensor_descriptor =
-            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({stride, 1}));
-                }
-                else
-                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({1, stride}));
-                }
-            };
-
-        Tensor<ADataType> a_m_k(
-            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-        Tensor<BDataType> b_k_n(
-            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-        Tensor<CDataType> c_m_n_host_result(
-            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-        Tensor<CDataType> c_m_n_device_result(
-            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-
-        auto f_generate_tensor_value = [](auto desc, auto type) {
-            using dataType = decltype(type);
-
-            if(std::is_same<dataType, int8_t>::value)
-            {
-                desc.GenerateTensorValue(GeneratorTensor_2<int8_t>{-5, 5});
-            }
-            else
-            {
-                desc.GenerateTensorValue(GeneratorTensor_3<dataType>{-0.5, 0.5});
-            }
-        };
-
-        f_generate_tensor_value(a_m_k, ADataType{});
-        f_generate_tensor_value(b_k_n, BDataType{});
-
-        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
-    }
-
-    auto operator()(DeviceGemmPtr_& gemmPtr)
-    {
-        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
-                  << ", CLayout = " << CLayout{}.name << std::endl;
-        std::cout << gemmPtr->GetTypeString() << std::endl;
-
-        // Arrange
-        ck::gemm_util::GemmParams params;
-        params.M       = 1024;
-        params.N       = 1024;
-        params.K       = 1024;
-        params.StrideA = 1024;
-        params.StrideB = 1024;
-        params.StrideC = 1024;
-
-        auto host_tensors = PrepareGemmTensor(params);
-
-        const Tensor<ADataType>& a  = std::get<0>(host_tensors);
-        const Tensor<BDataType>& b  = std::get<1>(host_tensors);
-        Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
-        Tensor<CDataType>& c_device = std::get<3>(host_tensors);
-
-        auto a_element_op = AElementwiseOperation{};
-        auto b_element_op = BElementwiseOperation{};
-        auto c_element_op = CElementwiseOperation{};
-
-        using ReferenceGemmInstance =
-            ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                      BDataType,
-                                                      CDataType,
-                                                      AElementwiseOperation,
-                                                      BElementwiseOperation,
-                                                      CElementwiseOperation>;
-        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
-            a, b, c_host, a_element_op, b_element_op, c_element_op);
-
-        // Act
-        ck::gemm_util::RunDeviceGEMM(
-            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
-
-        // Assert
-        bool res = false;
-        if(std::is_same<CDataType, float>::value)
-        {
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-        }
-        else if(std::is_same<CDataType, ck::half_t>::value)
-        {
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-        }
-        else if(std::is_same<CDataType, int8_t>::value)
-        {
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-        }
-
-        return res;
-    }
-};
-
-template <typename DeviceGemmPtr_,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct TestGemmBF16
-{
-    using BF16 = ck::bhalf_t;
-
-    auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params)
-    {
-        auto f_host_tensor_descriptor =
-            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({stride, 1}));
-                }
-                else
-                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                                std::vector<std::size_t>({1, stride}));
-                }
-            };
-
-        // use fp32 host kernel to verify bf16 device kernel
-        Tensor<BF16> a_m_k_bf16(
-            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-        Tensor<BF16> b_k_n_bf16(
-            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-        Tensor<BF16> c_m_n_device_bf16(
-            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-
-        Tensor<float> a_m_k_fp32(
-            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-        Tensor<float> b_k_n_fp32(
-            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-        Tensor<float> c_m_n_host_fp32(
-            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-        Tensor<float> c_m_n_device_fp32(
-            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-
-        a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
-        b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
-
-        bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
-        bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);
-
-        return std::make_tuple(a_m_k_bf16,
-                               b_k_n_bf16,
-                               c_m_n_device_bf16,
-                               a_m_k_fp32,
-                               b_k_n_fp32,
-                               c_m_n_host_fp32,
-                               c_m_n_device_fp32);
-    }
-
-    auto operator()(DeviceGemmPtr_& gemmPtr)
-    {
-        // Arrange
-        ck::gemm_util::GemmParams params;
-        params.M       = 1024;
-        params.N       = 1024;
-        params.K       = 1024;
-        params.StrideA = 1024;
-        params.StrideB = 1024;
-        params.StrideC = 1024;
-
-        auto host_tensors            = PrepareGemmTensorBF16(params);
-        const Tensor<BF16>& a_bf16   = std::get<0>(host_tensors);
-        const Tensor<BF16>& b_bf16   = std::get<1>(host_tensors);
-        Tensor<BF16>& c_device_bf16  = std::get<2>(host_tensors);
-        Tensor<float>& a_fp32        = std::get<3>(host_tensors);
-        Tensor<float>& b_fp32        = std::get<4>(host_tensors);
-        Tensor<float>& c_host_fp32   = std::get<5>(host_tensors);
-        Tensor<float>& c_device_fp32 = std::get<6>(host_tensors);
-
-        auto a_element_op = AElementwiseOperation{};
-        auto b_element_op = BElementwiseOperation{};
-        auto c_element_op = CElementwiseOperation{};
-
-        // use fp32 host kernel to verify bf16 device kernel
-        using ReferenceGemmInstance =
-            ck::tensor_operation::host::ReferenceGemm<float,
-                                                      float,
-                                                      float,
-                                                      AElementwiseOperation,
-                                                      BElementwiseOperation,
-                                                      CElementwiseOperation>;
-        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
-            a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);
-
-        // Act
-        ck::gemm_util::RunDeviceGEMM(gemmPtr,
-                                     params,
-                                     a_bf16,
-                                     b_bf16,
-                                     c_device_bf16,
-                                     a_element_op,
-                                     b_element_op,
-                                     c_element_op);
-
-        bf16_to_f32_(c_device_bf16, c_device_fp32);
-
-        // Assert
-        bool res = ck::utils::check_err(
-            c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
-        std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-
-        return res;
-    };
-};
-
-} // namespace gemm_util
-} // namespace ck
-#endif
+#ifndef GEMM_UTILS_HPP
+#define GEMM_UTILS_HPP
+
+#include "check_err.hpp"
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "reference_gemm.hpp"
+#include "tensor_layout.hpp"
+
+namespace ck {
+namespace gemm_util {
+
+struct GemmParams
+{
+    GemmParams()
+        : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0)
+    {
+    }
+
+    ck::index_t M;
+    ck::index_t N;
+    ck::index_t K;
+
+    ck::index_t StrideA;
+    ck::index_t StrideB;
+    ck::index_t StrideC;
+
+    float alpha;
+    float beta;
+};
+
+template <typename GemmInstance,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+void RunHostGEMM(const Tensor<ADataType>& A,
+                 const Tensor<BDataType>& B,
+                 Tensor<CDataType>& C,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op)
+{
+    auto ref_gemm    = GemmInstance{};
+    auto ref_invoker = ref_gemm.MakeInvoker();
+
+    auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);
+
+    ref_invoker.Run(ref_argument);
+}
+
+template <typename DeviceGemmPtr_,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
+                   const ck::gemm_util::GemmParams& params,
+                   const Tensor<ADataType>& A,
+                   const Tensor<BDataType>& B,
+                   Tensor<CDataType>& C,
+                   AElementwiseOperation a_element_op,
+                   BElementwiseOperation b_element_op,
+                   CElementwiseOperation c_element_op)
+{
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
+
+    a_m_k_device_buf.ToDevice(A.mData.data());
+    b_k_n_device_buf.ToDevice(B.mData.data());
+
+    auto invoker_ptr = gemmPtr->MakeInvokerPointer();
+    auto argument_ptr =
+        gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                     static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                     static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                     params.M,
+                                     params.N,
+                                     params.K,
+                                     params.StrideA,
+                                     params.StrideB,
+                                     params.StrideC,
+                                     a_element_op,
+                                     b_element_op,
+                                     c_element_op);
+
+    if(!gemmPtr->IsSupportedArgument(argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    invoker_ptr->Run(argument_ptr.get());
+    c_m_n_device_buf.FromDevice(C.mData.data());
+}
+
+template <typename DeviceGemmPtr_,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct TestGemm
+{
+    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
+    {
+        auto f_host_tensor_descriptor =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+                {
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                                std::vector<std::size_t>({stride, 1}));
+                }
+                else
+                {
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                                std::vector<std::size_t>({1, stride}));
+                }
+            };
+
+        Tensor<ADataType> a_m_k(
+            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
+        Tensor<BDataType> b_k_n(
+            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+        Tensor<CDataType> c_m_n_host_result(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        Tensor<CDataType> c_m_n_device_result(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+
+        auto f_generate_tensor_value = [](auto& tensor, auto type) {
+            using dataType = decltype(type);
+
+            tensor.GenerateTensorValue(GeneratorTensor_2<dataType>{-5, 5});
+        };
+
+        f_generate_tensor_value(a_m_k, ADataType{});
+        f_generate_tensor_value(b_k_n, BDataType{});
+
+        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
+    }
+
+    auto operator()(DeviceGemmPtr_& gemmPtr)
+    {
+        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
+                  << ", CLayout = " << CLayout{}.name << std::endl;
+        std::cout << gemmPtr->GetTypeString() << std::endl;
+
+        // Arrange
+        ck::gemm_util::GemmParams params;
+        params.M       = 1024;
+        params.N       = 1024;
+        params.K       = 1024;
+        params.StrideA = 1024;
+        params.StrideB = 1024;
+        params.StrideC = 1024;
+
+        auto host_tensors = PrepareGemmTensor(params);
+
+        const Tensor<ADataType>& a  = std::get<0>(host_tensors);
+        const Tensor<BDataType>& b  = std::get<1>(host_tensors);
+        Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
+        Tensor<CDataType>& c_device = std::get<3>(host_tensors);
+
+        auto a_element_op = AElementwiseOperation{};
+        auto b_element_op = BElementwiseOperation{};
+        auto c_element_op = CElementwiseOperation{};
+
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                      BDataType,
+                                                      CDataType,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation>;
+        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
+            a, b, c_host, a_element_op, b_element_op, c_element_op);
+
+        // Act
+        ck::gemm_util::RunDeviceGEMM(
+            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
+
+        // Assert
+        bool res = false;
+        if(std::is_same<CDataType, float>::value)
+        {
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
+            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+        }
+        else if(std::is_same<CDataType, ck::half_t>::value)
+        {
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
+            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+        }
+        else if(std::is_same<CDataType, int8_t>::value)
+        {
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
+            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+        }
+
+        return res;
+    }
+};
+
+template <typename DeviceGemmPtr_,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct TestGemmBF16
+{
+    using BF16 = ck::bhalf_t;
+
+    auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params)
+    {
+        auto f_host_tensor_descriptor =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+                {
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                                std::vector<std::size_t>({stride, 1}));
+                }
+                else
+                {
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                                std::vector<std::size_t>({1, stride}));
+                }
+            };
+
+        // use fp32 host kernel to verify bf16 device kernel
+        Tensor<BF16> a_m_k_bf16(
+            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
+        Tensor<BF16> b_k_n_bf16(
+            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+        Tensor<BF16> c_m_n_device_bf16(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+
+        Tensor<float> a_m_k_fp32(
+            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
+        Tensor<float> b_k_n_fp32(
+            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+        Tensor<float> c_m_n_host_fp32(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        Tensor<float> c_m_n_device_fp32(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+
+        a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
+        b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
+
+        bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
+        bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);
+
+        return std::make_tuple(a_m_k_bf16,
+                               b_k_n_bf16,
+                               c_m_n_device_bf16,
+                               a_m_k_fp32,
+                               b_k_n_fp32,
+                               c_m_n_host_fp32,
+                               c_m_n_device_fp32);
+    }
+
+    auto operator()(DeviceGemmPtr_& gemmPtr)
+    {
+        // Arrange
+        ck::gemm_util::GemmParams params;
+        params.M       = 1024;
+        params.N       = 1024;
+        params.K       = 1024;
+        params.StrideA = 1024;
+        params.StrideB = 1024;
+        params.StrideC = 1024;
+
+        auto host_tensors            = PrepareGemmTensorBF16(params);
+        const Tensor<BF16>& a_bf16   = std::get<0>(host_tensors);
+        const Tensor<BF16>& b_bf16   = std::get<1>(host_tensors);
+        Tensor<BF16>& c_device_bf16  = std::get<2>(host_tensors);
+        Tensor<float>& a_fp32        = std::get<3>(host_tensors);
+        Tensor<float>& b_fp32        = std::get<4>(host_tensors);
+        Tensor<float>& c_host_fp32   = std::get<5>(host_tensors);
+        Tensor<float>& c_device_fp32 = std::get<6>(host_tensors);
+
+        auto a_element_op = AElementwiseOperation{};
+        auto b_element_op = BElementwiseOperation{};
+        auto c_element_op = CElementwiseOperation{};
+
+        // use fp32 host kernel to verify bf16 device kernel
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceGemm<float,
+                                                      float,
+                                                      float,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation>;
+        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
+            a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);
+
+        // Act
+        ck::gemm_util::RunDeviceGEMM(gemmPtr,
+                                     params,
+                                     a_bf16,
+                                     b_bf16,
+                                     c_device_bf16,
+                                     a_element_op,
+                                     b_element_op,
+                                     c_element_op);
+
+        bf16_to_f32_(c_device_bf16, c_device_fp32);
+
+        // Assert
+        bool res = ck::utils::check_err(
+            c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
+        std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+
+        return res;
+    };
+};
+
+} // namespace gemm_util
+} // namespace ck
+#endif
--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -16,22 +16,22 @@ int main()
    pass = pass &&
           ck::profiler::
               profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Row, Row>(
-                   true, 1, false, 1, M, N, K, K, N, N);
+                   true, 1, false, false, M, N, K, K, N, N);

    pass = pass &&
           ck::profiler::
               profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Col, Row>(
-                   true, 1, false, 1, M, N, K, K, K, N);
+                   true, 1, false, false, M, N, K, K, K, N);

    pass = pass &&
           ck::profiler::
               profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Row, Row>(
-                   true, 1, false, 1, M, N, K, M, N, N);
+                   true, 1, false, false, M, N, K, M, N, N);

    pass = pass &&
           ck::profiler::
               profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Col, Row>(
-                   true, 1, false, 1, M, N, K, M, K, N);
+                   true, 1, false, false, M, N, K, M, K, N);

    if(pass)
    {

--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -45,7 +45,7 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 {
    float max_diff = 1e-6;

-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
    {
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
@@ -187,9 +187,10 @@ int test_gemm(const gemmArgs& args)

        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
-            invoker_ptr->Run(argument_ptr.get(), 0);
+            invoker_ptr->Run(argument_ptr.get());

            c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
            if(!check_out(c_m_n_host_result, c_m_n_device_result))
            {
                success = false;

--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -104,7 +104,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
    b_tensors_device.reserve(group_count);
    c_tensors_device.reserve(group_count);

-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors.emplace_back(Tensor<ADataType>(f_host_tensor_descriptor(
            gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
@@ -119,7 +119,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
        b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
    }

-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors_device.emplace_back(
            std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));
@@ -147,7 +147,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)

    invoker_ptr->Run(argument_ptr.get());

-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());


--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -460,7 +460,7 @@ class SimpleAppArgs

    int processArgs(int argc, char* argv[])
    {
-        unsigned int ch;
+        int ch;

        while(1)
        {

--- a/test/reduce/reduce_util.hpp
+++ b/test/reduce/reduce_util.hpp
@@ -9,7 +9,7 @@ namespace reduce_util {
 template <typename T>
 void to_f32_vector(const Tensor<T>& src, Tensor<float>& dst)
 {
-    for(int i = 0; i < src.mData.size(); ++i)
+    for(std::size_t i = 0; i < src.mData.size(); ++i)
        dst.mData[i] = type_convert<float>(src.mData[i]);
 }


--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -463,7 +463,7 @@ class SimpleAppArgs

    int processArgs(int argc, char* argv[])
    {
-        unsigned int ch;
+        int ch;

        while(1)
        {

--- a/test/reference_conv_fwd/CMakeLists.txt
+++ b/test/reference_conv_fwd/CMakeLists.txt
-add_test_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
-target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_fwd_util)
+add_gtest_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
+target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_util)
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -4,10 +4,11 @@
 #include <numeric>
 #include <type_traits>
 #include <vector>
+#include "gtest/gtest.h"

 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "element_wise_operation.hpp"
 #include "fill.hpp"
 #include "host_tensor.hpp"
@@ -33,21 +34,21 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
                                  const FillInputOp& fill_input_op     = FillInputOp{},
                                  const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
 {
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));

-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));
@@ -73,32 +74,32 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
    auto ref_argument = ref_conv.MakeArgument(input,
                                              weights,
                                              host_output,
-                                              params.conv_filter_strides,
-                                              params.conv_filter_dilations,
-                                              params.input_left_pads,
-                                              params.input_right_pads,
+                                              params.conv_filter_strides_,
+                                              params.conv_filter_dilations_,
+                                              params.input_left_pads_,
+                                              params.input_right_pads_,
                                              InElementOp{},
                                              WeiElementOp{},
                                              OutElementOp{});

    ref_invoker.Run(ref_argument);
-    // std::cout <<"output: " << host_output.mDesc << std::endl << host_output.mData << std::endl;
    return host_output;
 }

-bool test_conv2d_nhwc()
+} // anonymous namespace
+
+TEST(ReferenceConvolutionFWD, Conv2DNHWC)
 {
-    bool res{true};
    ck::utils::conv::ConvParams params;
-    params.N                      = 1;
-    params.K                      = 1;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6, 6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0};
+    params.N_                      = 1;
+    params.K_                      = 1;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0};

    auto out_tensor = run_reference_convolution_forward<2>(params);
    std::vector<std::size_t> ref_dims{1, 1, 4, 4};
@@ -118,51 +119,50 @@ bool test_conv2d_nhwc()
                                472.5,
                                490.5,
                                508.5};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
-                                      "Error: wrong output tensor dimensions!");
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
+    EXPECT_TRUE(ck::utils::check_err(
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+}

-    params.N                      = 1;
-    params.K                      = 2;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12, 12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{2, 2};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1};
+TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding)
+{
+    ck::utils::conv::ConvParams params;
+    params.N_                      = 1;
+    params.K_                      = 2;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2, 2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{2, 2};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1};

-    out_tensor = run_reference_convolution_forward<2>(params);
-    ref_dims   = std::vector<std::size_t>{1, 2, 5, 5};
-    ref_data   = std::vector<float>{
+    auto out_tensor                   = run_reference_convolution_forward<2>(params);
+    std::vector<std::size_t> ref_dims = std::vector<std::size_t>{1, 2, 5, 5};
+    std::vector<float> ref_data{
        210.,  210.,  327.,   327.,   351.,   351.,   375.,   375.,   399.,   399.,
        459.,  459.,  706.5,  706.5,  742.5,  742.5,  778.5,  778.5,  814.5,  814.5,
        747.,  747.,  1138.5, 1138.5, 1174.5, 1174.5, 1210.5, 1210.5, 1246.5, 1246.5,
        1035., 1035., 1570.5, 1570.5, 1606.5, 1606.5, 1642.5, 1642.5, 1678.5, 1678.5,
        1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
-                                      "Error: wrong output tensor dimensions!");
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
-
-    return res;
+    EXPECT_TRUE(ck::utils::check_err(
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
 }

-bool test_conv1d_nwc()
+TEST(ReferenceConvolutionFWD, Conv1DNWC)
 {
-    bool res{true};
    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
-    params.N                      = 1;
-    params.K                      = 1;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{0};
-    params.input_right_pads       = std::vector<ck::index_t>{0};
+    params.num_dim_spatial_        = 1;
+    params.N_                      = 1;
+    params.K_                      = 1;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0};

    auto out_tensor =
        run_reference_convolution_forward<1,
@@ -174,46 +174,53 @@ bool test_conv1d_nwc()
                                          ck::tensor_layout::convolution::NWK>(params);
    std::vector<std::size_t> ref_dims{1, 1, 4};
    std::vector<float> ref_data{7.5, 13.5, 19.5, 25.5};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
-                                      "Error: wrong output tensor dimensions!");
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
+    EXPECT_TRUE(ck::utils::check_err(
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+}

-    params.num_dim_spatial        = 1;
-    params.N                      = 1;
-    params.K                      = 2;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{2};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding)
+{
+    ck::utils::conv::ConvParams params;
+    params.num_dim_spatial_        = 1;
+    params.N_                      = 1;
+    params.K_                      = 2;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{2};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};

-    out_tensor = run_reference_convolution_forward<1,
-                                                   float,
-                                                   float,
-                                                   float,
-                                                   ck::tensor_layout::convolution::NWC,
-                                                   ck::tensor_layout::convolution::KXC,
-                                                   ck::tensor_layout::convolution::NWK>(params);
-    ref_dims   = std::vector<std::size_t>{1, 2, 5};
-    ref_data   = std::vector<float>{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
-    res        = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
-                                      "Error: wrong output tensor dimensions!");
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
+    auto out_tensor =
+        run_reference_convolution_forward<1,
+                                          float,
+                                          float,
+                                          float,
+                                          ck::tensor_layout::convolution::NWC,
+                                          ck::tensor_layout::convolution::KXC,
+                                          ck::tensor_layout::convolution::NWK>(params);
+    std::vector<std::size_t> ref_dims{1, 2, 5};
+    std::vector<float> ref_data{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
+    EXPECT_TRUE(ck::utils::check_err(
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+}

-    params.num_dim_spatial        = 1;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 4;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize)
+{
+    ck::utils::conv::ConvParams params;
+    params.num_dim_spatial_        = 1;
+    params.N_                      = 2;
+    params.K_                      = 16;
+    params.C_                      = 4;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{16};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};

    auto out_tensor2 = run_reference_convolution_forward<1,
                                                         float,
@@ -224,8 +231,8 @@ bool test_conv1d_nwc()
                                                         ck::tensor_layout::convolution::NWK>(
        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});

-    ref_dims = std::vector<std::size_t>{2, 16, 16};
-    ref_data = std::vector<float>{
+    std::vector<std::size_t> ref_dims{2, 16, 16};
+    std::vector<float> ref_data{
        1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,
        1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,
        3.3,       3.3,       3.3,       3.3,       3.3,       3.3,       3.3,       3.3,
@@ -290,28 +297,24 @@ bool test_conv1d_nwc()
        72.9,      72.9,      72.9,      72.9,      72.9,      72.9,      72.9,      72.9,
        49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,
        49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4};
-    res = res && ck::utils::check_err(out_tensor2.mDesc.GetLengths(),
-                                      ref_dims,
-                                      "Error: wrong output tensor dimensions!");
-    res = res && ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!");
-
-    return res;
+    EXPECT_TRUE(ck::utils::check_err(
+        out_tensor2.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
+    EXPECT_TRUE(ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!"));
 }

-bool test_conv3d_ncdhw()
+TEST(ReferenceConvolutionFWD, Conv3DNCDHW)
 {
-    bool res{true};
    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 3;
-    params.N                      = 1;
-    params.K                      = 1;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6, 6, 6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0, 0};
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 1;
+    params.K_                      = 1;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6, 6};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};

    auto out_tensor = run_reference_convolution_forward<3,
                                                        float,
@@ -331,32 +334,37 @@ bool test_conv3d_ncdhw()
        634.5,     637.2,     639.9,     642.60004, 650.7,     653.4,     656.10004, 658.8,
        699.3,     702.,      704.7,     707.4,     715.5,     718.2,     720.9,     723.60004,
        731.7,     734.4001,  737.10004, 739.8,     747.9001,  750.60004, 753.3,     756.};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
-                                      "Error [case 1]: wrong output tensor dimensions!");
-    res = res &&
-          ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!");
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+                                     ref_dims,
+                                     "Error [case 1]: wrong output tensor dimensions!"));
+    EXPECT_TRUE(
+        ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!"));
+}

-    params.N                      = 1;
-    params.K                      = 2;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12, 12, 12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{3, 3, 3};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0, 0};
+TEST(ReferenceConvolutionFWD, Conv3DNCDHWStridesDilations)
+{
+    ck::utils::conv::ConvParams params;
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 1;
+    params.K_                      = 2;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12, 12};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{3, 3, 3};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};

-    out_tensor = run_reference_convolution_forward<3,
-                                                   float,
-                                                   float,
-                                                   float,
-                                                   ck::tensor_layout::convolution::NCDHW,
-                                                   ck::tensor_layout::convolution::KCZYX,
-                                                   ck::tensor_layout::convolution::NKDHW>(
+    auto out_tensor = run_reference_convolution_forward<3,
+                                                        float,
+                                                        float,
+                                                        float,
+                                                        ck::tensor_layout::convolution::NCDHW,
+                                                        ck::tensor_layout::convolution::KCZYX,
+                                                        ck::tensor_layout::convolution::NKDHW>(
        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
-    ref_dims = std::vector<std::size_t>{1, 2, 4, 4, 4};
-    ref_data = std::vector<float>{
+    std::vector<std::size_t> ref_dims{1, 2, 4, 4, 4};
+    std::vector<float> ref_data{
        2756.7002, 2764.7998, 2772.9001, 2781.,     2853.9001, 2862.,     2870.1,    2878.2002,
        2951.1,    2959.2002, 2967.2998, 2975.4001, 3048.2998, 3056.4001, 3064.5,    3072.6,
        3923.1,    3931.2,    3939.2998, 3947.4,    4020.2998, 4028.4001, 4036.5002, 4044.5999,
@@ -373,26 +381,9 @@ bool test_conv3d_ncdhw()
        5283.9004, 5292.,     5300.0996, 5308.2,    5381.0996, 5389.2,    5397.3,    5405.4004,
        6255.9004, 6264.0005, 6272.1,    6280.2,    6353.1,    6361.2,    6369.301,  6377.4,
        6450.301,  6458.4,    6466.5,    6474.6,    6547.5,    6555.6,    6563.699,  6571.801};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
-                                      "Error [case 2]: wrong output tensor dimensions!");
-    res =
-        res && ck::utils::check_err(
-                   out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f);
-
-    return res;
-}
-
-} // anonymous namespace
-
-int main(void)
-{
-    bool res{true};
-    res = test_conv2d_nhwc();
-    std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv1d_nwc();
-    std::cout << "TestConv1DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv3d_ncdhw();
-    std::cout << "test_conv3d_ncdhw ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return res ? 0 : 1;
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+                                     ref_dims,
+                                     "Error [case 2]: wrong output tensor dimensions!"));
+    EXPECT_TRUE(ck::utils::check_err(
+        out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f));
 }