Commit 930b2872 authored by Harisankar Sadasivan's avatar Harisankar Sadasivan
Browse files

Best-performing kernel for the GEMV codex problem with M=1 and an inverted B matrix

parents a1e17d18 a4f72a31
...@@ -2,25 +2,28 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,25 +2,28 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) add_custom_target(test_batched_gemm_softmax_gemm_permute)
add_custom_target(test_batched_gemm_softmax_gemm_permute) add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
endif() if(result EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp) add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp) endif()
target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) if(result EQUAL 0)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16) target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16) add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
endif() endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp) add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp) if(result EQUAL 0)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16) endif()
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16) add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
endif() if(result EQUAL 0)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
endif()
set(target 1) set(target 1)
endif() endif()
endforeach() endforeach()
\ No newline at end of file
...@@ -70,10 +70,23 @@ class TestBatchNormBwdRank4 : public ::testing::Test ...@@ -70,10 +70,23 @@ class TestBatchNormBwdRank4 : public ::testing::Test
} }
}; };
using KernelTypes = ::testing::Types<std::tuple<F16, F32, F32, F32, F16, F32, F32>, using KernelTypes = ::testing::Types<
std::tuple<F32, F32, F32, F32, F32, F32, F32>, #ifdef CK_ENABLE_FP16
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>, std::tuple<F16, F32, F32, F32, F16, F32, F32>
std::tuple<F64, F64, F64, F64, F64, F64, F64>>; #endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes); TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes);
......
...@@ -87,10 +87,23 @@ class TestBatchNormFwdRank4 : public ::testing::Test ...@@ -87,10 +87,23 @@ class TestBatchNormFwdRank4 : public ::testing::Test
} }
}; };
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>, using KernelTypes = ::testing::Types<
std::tuple<F32, F32, F32, F32, F32, F32>, #ifdef CK_ENABLE_FP16
std::tuple<BF16, BF16, F32, BF16, BF16, F32>, std::tuple<F16, F16, F32, F16, F16, F32>
std::tuple<F64, F64, F64, F64, F64, F64>>; #endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes); TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);
......
...@@ -67,10 +67,23 @@ class TestBatchNormInferRank4 : public ::testing::Test ...@@ -67,10 +67,23 @@ class TestBatchNormInferRank4 : public ::testing::Test
} }
}; };
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>, using KernelTypes = ::testing::Types<
std::tuple<F32, F32, F32, F32, F32, F32>, #ifdef CK_ENABLE_FP16
std::tuple<BF16, BF16, F32, BF16, BF16, F32>, std::tuple<F16, F16, F32, F16, F16, F32>
std::tuple<F64, F64, F64, F64, F64, F64>>; #endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes); TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes);
......
# Functional test for image-to-column / column-to-image tensor rearrange;
# links the device instance libraries it exercises.
add_gtest_executable(test_conv_tensor_rearrange test_conv_tensor_rearrange.cpp)
target_link_libraries(test_conv_tensor_rearrange PRIVATE utility device_image_to_column_instance device_column_to_image_instance)
# Interface-only test (IsSupportedArgument checks) — no instance libraries needed.
add_gtest_executable(test_conv_tensor_rearrange_interface test_conv_tensor_rearrange_interface.cpp)
target_link_libraries(test_conv_tensor_rearrange_interface PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_conv_tensor_rearrange_impl.hpp"
// Typed fixture for the conv tensor-rearrange tests. The tuple parameter
// supplies the image layout and the rearrange direction under test.
template <typename Tuple>
class TestConvTensorRearrange : public ::testing::Test
{
    protected:
    using ImLayout              = std::tuple_element_t<0, Tuple>;
    using ConvTensorRearrangeOp = std::tuple_element_t<1, Tuple>;

    // Convolution problem descriptions queued by each test body before Run().
    std::vector<ck::utils::conv::ConvParam> conv_params;

    // Profiles every queued parameter set with verification enabled and fails
    // the test if any case does not pass. Note the short-circuit: once one
    // case fails, the remaining cases are skipped.
    template <ck::index_t NDimSpatial, typename InDataType, typename OutDataType>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        bool all_ok = true;
        for(auto& conv_param : conv_params)
        {
            all_ok = all_ok &&
                     ck::profiler::profile_conv_tensor_rearrange_impl<NDimSpatial,
                                                                      ImLayout,
                                                                      InDataType,
                                                                      OutDataType,
                                                                      ConvTensorRearrangeOp>(
                         true,  // do_verification
                         1,     // init_method: integer value
                         false, // do_log
                         false, // time_kernel
                         conv_param);
        }
        EXPECT_TRUE(all_ok);
    }
};
using namespace ck::tensor_layout::convolution;
using namespace ck::conv_tensor_rearrange_op;
// Each tuple pairs an image layout with a rearrange direction; every layout is
// tested both image->column and column->image.
using KernelTypes1d =
::testing::Types<std::tuple<GNWC, ImageToColumn>, std::tuple<GNWC, ColumnToImage>>;
using KernelTypes2d =
::testing::Types<std::tuple<GNHWC, ImageToColumn>, std::tuple<GNHWC, ColumnToImage>>;
using KernelTypes3d =
::testing::Types<std::tuple<GNDHWC, ImageToColumn>, std::tuple<GNDHWC, ColumnToImage>>;
// Thin per-dimensionality fixtures so each spatial rank gets its own suite.
template <typename Tuple>
class TestConvTensorRearrange1d : public TestConvTensorRearrange<Tuple>
{
};
template <typename Tuple>
class TestConvTensorRearrange2d : public TestConvTensorRearrange<Tuple>
{
};
template <typename Tuple>
class TestConvTensorRearrange3d : public TestConvTensorRearrange<Tuple>
{
};
TYPED_TEST_SUITE(TestConvTensorRearrange1d, KernelTypes1d);
TYPED_TEST_SUITE(TestConvTensorRearrange2d, KernelTypes2d);
TYPED_TEST_SUITE(TestConvTensorRearrange3d, KernelTypes3d);
// 1D coverage: filter/stride/dilation/padding variants plus a small-C case,
// run once per enabled data type.
TYPED_TEST(TestConvTensorRearrange1d, Test1D)
{
this->conv_params.clear();
// Each initializer is a ck::utils::conv::ConvParam; presumably
// {ndim, G, N, K, C, {filter}, {input}, {stride}, {dilation}, {lpad}, {rpad}}
// — TODO confirm field order against ConvParam's constructor.
this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
// ScalarPerVector should be 1
this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
// stride != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
// dilation != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
// Run only the data types compiled into this build.
#ifdef CK_ENABLE_FP32
this->template Run<1, float, float>();
#endif
#ifdef CK_ENABLE_BF16
this->template Run<1, ck::bhalf_t, ck::bhalf_t>();
#endif
#ifdef CK_ENABLE_FP16
this->template Run<1, ck::half_t, ck::half_t>();
#endif
#ifdef CK_ENABLE_INT8
this->template Run<1, int8_t, int8_t>();
#endif
}
// 2D coverage: mirrors the 1D cases (various filter/stride/dilation/padding),
// run once per enabled data type.
TYPED_TEST(TestConvTensorRearrange2d, Test2D)
{
this->conv_params.clear();
this->conv_params.push_back(
{2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
// stride != 1 and dilation != 1 combined
this->conv_params.push_back(
{2, 1, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
#ifdef CK_ENABLE_FP32
this->template Run<2, float, float>();
#endif
#ifdef CK_ENABLE_BF16
this->template Run<2, ck::bhalf_t, ck::bhalf_t>();
#endif
#ifdef CK_ENABLE_FP16
this->template Run<2, ck::half_t, ck::half_t>();
#endif
#ifdef CK_ENABLE_INT8
this->template Run<2, int8_t, int8_t>();
#endif
}
// 3D coverage: strided/dilated and pointwise-filter cases, run once per
// enabled data type.
TYPED_TEST(TestConvTensorRearrange3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}});
#ifdef CK_ENABLE_FP32
this->template Run<3, float, float>();
#endif
#ifdef CK_ENABLE_BF16
this->template Run<3, ck::bhalf_t, ck::bhalf_t>();
#endif
#ifdef CK_ENABLE_FP16
this->template Run<3, ck::half_t, ck::half_t>();
#endif
#ifdef CK_ENABLE_INT8
this->template Run<3, int8_t, int8_t>();
#endif
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
// Element type both device instances below are built with.
using DataType = float;
// 1D grouped image layout used throughout this interface test.
using ImLayout = ck::tensor_layout::convolution::GNWC;
// Shorthand for compile-time integer sequences (thread cluster lengths).
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
// Fixture that checks which rearrange configurations the device instances
// report as supported (IsSupportedArgument only — no kernel is launched).
//
// ScalarPerVector - vector load/store width the instances are compiled with.
// IsCPacked       - when false, Run() reports a halved "fake" C so the
//                   innermost dimension no longer matches the packed strides.
template <ck::index_t ScalarPerVector, bool IsCPacked>
class TestConvTensorRearrangeInterface : public ::testing::Test
{
    protected:
    static constexpr ck::index_t NDimSpatial = 1;

    // clang-format off
    using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
        // Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
        // Dim|         |           |            |  Size| Block| Block| Cluster|    Per|
        // Spatial|     |           |            |      |      |      | Lengths| Vector|
        //        |     |           |            |      |      |      |        |       |
        < NDimSpatial, ImLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>;
    using DeviceColToimgInstance = ck::tensor_operation::device::DeviceColumnToImageImpl
        // Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
        // Dim|         |           |            |  Size| Block| Block| Cluster|    Per|
        // Spatial|     |           |            |      |      |      | Lengths| Vector|
        //        |     |           |            |      |      |      |        |       |
        < NDimSpatial, ImLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>;
    // clang-format on

    // Problem description set by each test body before calling Run().
    ck::utils::conv::ConvParam conv_param;

    // Builds MakeArgument inputs from conv_param and returns whether the
    // selected device instance accepts them. Data pointers are null on
    // purpose: only the argument check runs, never the kernel.
    template <typename ConvTensorRearrangeOp>
    bool Run()
    {
        const auto N = conv_param.N_;
        const auto C = conv_param.C_;
        // Fake C to simulate the behavior that C is not packed.
        const auto FakeC = conv_param.C_ / 2;
        // GEMM M dimension: N * product(output spatial lengths).
        const ck::index_t NDoHoWo =
            N *
            ck::accumulate_n<ck::index_t>(
                conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
        // GEMM K dimension: C * product(filter spatial lengths).
        const ck::index_t CZYX =
            C *
            ck::accumulate_n<ck::index_t>(
                conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
        const auto image_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(
                conv_param);
        const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});

        // MakeArgument takes fixed-size arrays; copy the dynamic ConvParam
        // vectors/descriptor strides over.
        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
        std::array<ck::index_t, 2> output_m_k_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
        std::array<ck::index_t, NDimSpatial> input_left_pads{};
        std::array<ck::index_t, NDimSpatial> input_right_pads{};
        auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
        copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
        copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
        copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
        copy(image_desc.GetStrides(), input_g_n_c_wis_strides);
        copy(gemm_desc.GetStrides(), output_m_k_strides);
        copy(conv_param.conv_filter_strides_, conv_filter_strides);
        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
        copy(conv_param.input_left_pads_, input_left_pads);
        copy(conv_param.input_right_pads_, input_right_pads);

        if constexpr(std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
        {
            auto img2col = DeviceImgToColInstance{};
            auto argument = img2col.MakeArgument(nullptr,
                                                 nullptr,
                                                 N,
                                                 IsCPacked ? C : FakeC,
                                                 input_spatial_lengths,
                                                 filter_spatial_lengths,
                                                 output_spatial_lengths,
                                                 input_g_n_c_wis_strides,
                                                 output_m_k_strides,
                                                 conv_filter_strides,
                                                 conv_filter_dilations,
                                                 input_left_pads,
                                                 input_right_pads);
            return img2col.IsSupportedArgument(argument);
        }
        else if constexpr(std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
        {
            auto col2img = DeviceColToimgInstance{};
            auto argument = col2img.MakeArgument(nullptr,
                                                 nullptr,
                                                 N,
                                                 IsCPacked ? C : FakeC,
                                                 input_spatial_lengths,
                                                 filter_spatial_lengths,
                                                 output_spatial_lengths,
                                                 input_g_n_c_wis_strides,
                                                 output_m_k_strides,
                                                 conv_filter_strides,
                                                 conv_filter_dilations,
                                                 input_left_pads,
                                                 input_right_pads);
            return col2img.IsSupportedArgument(argument);
        }
        // Previously control could flow off the end of this non-void function
        // (undefined behavior) when ConvTensorRearrangeOp matched neither
        // branch; treat any other op as unsupported instead.
        return false;
    }
};
// Width-1 vector access: every configuration below should be supported.
class TestConvTensorRearrangeInterface1ScalarPerVector
: public TestConvTensorRearrangeInterface<1, true>
{
};
// Width-4 vector access with packed C: alignment of C (and padded spans) matters.
class TestConvTensorRearrangeInterface4ScalarPerVector
: public TestConvTensorRearrangeInterface<4, true>
{
};
// Width-4 vector access with a "fake" (halved) C simulating non-packed C.
class TestConvTensorRearrangeInterface4ScalarPerVectorFakeC
: public TestConvTensorRearrangeInterface<4, false>
{
};
// With ScalarPerVector = 1 every alignment constraint is trivially met, so
// all of these configurations must be reported as supported.
TEST_F(TestConvTensorRearrangeInterface1ScalarPerVector, X1ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
}
// With ScalarPerVector = 4, C = 1 cases violate the vector-width divisibility
// requirement (expected unsupported); only the final C = 4 case aligns.
TEST_F(TestConvTensorRearrangeInterface4ScalarPerVector, X4ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// C = 4 — divisible by ScalarPerVector, so both directions are supported
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
}
// IsCPacked = false: Run() passes FakeC = C/2 to MakeArgument, so support
// hinges on whether C/2 is divisible by ScalarPerVector (= 4).
TEST_F(TestConvTensorRearrangeInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
{
// C = 3 -> fake C = 1, not divisible by 4
this->conv_param = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// C = 8 -> fake C = 4, divisible by 4
this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
}
if (USE_BITINT_EXTENSION_INT4) if (USE_BITINT_EXTENSION_INT4)
add_gtest_executable(test_int4 int4.cpp) add_gtest_executable(test_int4 int4.cpp)
target_link_libraries(test_int4 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_int4 PRIVATE utility)
endif()
endif() endif()
if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES) add_gtest_executable(test_fp8 fp8.cpp)
add_gtest_executable(test_f8 f8.cpp) if(result EQUAL 0)
target_link_libraries(test_f8 PRIVATE utility) target_link_libraries(test_fp8 PRIVATE utility)
endif() endif()
add_gtest_executable(test_bf8 bf8.cpp)
if(DTYPES MATCHES "bf8" OR NOT DEFINED DTYPES) if(result EQUAL 0)
add_gtest_executable(test_bf8 bf8.cpp)
target_link_libraries(test_bf8 PRIVATE utility) target_link_libraries(test_bf8 PRIVATE utility)
endif() endif()
add_gtest_executable(test_type_convert_const type_convert_const.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using ck::bhalf_t;
using ck::type_convert;
// Verifies type_convert accepts const-qualified *destination* types and that
// the numeric result matches the plain (non-const) conversion within bf16
// rounding error.
TEST(TypeConvertConst, ConvertToConst)
{
// bf16 keeps 8 significand bits, so 2^-7 = 0.0078125 is its relative epsilon.
constexpr float bf16_epsilon = 0.0078125;
constexpr float rel_tol = 2 * bf16_epsilon;
const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
for(float x : cases)
{
// Tolerance scales with |x|; for x == 0 it degenerates to an exact check.
const float abs_tol = std::abs(rel_tol * x);
{
bhalf_t y = type_convert<bhalf_t>(x);
// Test non-const bhalf to const float.
const float y_float = type_convert<const float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test non-const float to const bhalf.
const bhalf_t y = type_convert<const bhalf_t>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
}
}
// Verifies type_convert accepts const-qualified *source* types — both deduced
// and fully-specified — producing the same value as the non-const path.
TEST(TypeConvertConst, ConvertFromConst)
{
// bf16 keeps 8 significand bits, so 2^-7 = 0.0078125 is its relative epsilon.
constexpr float bf16_epsilon = 0.0078125;
constexpr float rel_tol = 2 * bf16_epsilon;
const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
for(const float x : cases)
{
// Tolerance scales with |x|; for x == 0 it degenerates to an exact check.
const float abs_tol = std::abs(rel_tol * x);
{
// Test const float to const bhalf_t.
const bhalf_t y = type_convert<const bhalf_t>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test const float to non-const bhalf.
bhalf_t y = type_convert<bhalf_t>(x);
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
const bhalf_t y = type_convert<const bhalf_t>(x);
// Test const bhalf to non-const float.
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
// Tests with full type specializations for X (no template-argument deduction).
{
// Test const float to const bhalf_t.
const bhalf_t y = type_convert<const bhalf_t, const float>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test const float to non-const bhalf.
bhalf_t y = type_convert<bhalf_t, const float>(x);
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
const bhalf_t y = type_convert<const bhalf_t, const float>(x);
// Test const bhalf to non-const float.
float y_float = type_convert<float, const bhalf_t>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
}
}
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_custom_target(test_elementwise_normalization)
add_custom_target(test_elementwise_normalization) add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) if(result EQUAL 0)
target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance) target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)
endif() endif()
\ No newline at end of file
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_fp32 gemm_fp32.cpp) add_test_executable(test_gemm_fp32 gemm_fp32.cpp)
target_link_libraries(test_gemm_fp32 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_fp32 PRIVATE utility device_gemm_instance)
endif() endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_fp16 gemm_fp16.cpp) add_test_executable(test_gemm_fp16 gemm_fp16.cpp)
target_link_libraries(test_gemm_fp16 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_fp16 PRIVATE utility device_gemm_instance)
add_library(gemm_standalone_xdl_fp16_instances STATIC add_library(gemm_standalone_xdl_fp16_instances STATIC
instance/gemm_f16_nn_instance.cpp instance/gemm_f16_nn_instance.cpp
instance/gemm_f16_nt_instance.cpp instance/gemm_f16_nt_instance.cpp
instance/gemm_f16_tn_instance.cpp instance/gemm_f16_tn_instance.cpp
instance/gemm_wavelet_f16_tn_instance.cpp instance/gemm_wavelet_f16_tn_instance.cpp
instance/gemm_f16_tt_instance.cpp instance/gemm_f16_tt_instance.cpp
) )
endif()
add_test_executable(test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp) add_test_executable(test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp)
target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility) if(result EQUAL 0)
target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/) target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility)
target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/)
endif() endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_bf16 gemm_bf16.cpp) add_test_executable(test_gemm_bf16 gemm_bf16.cpp)
target_link_libraries(test_gemm_bf16 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_bf16 PRIVATE utility device_gemm_instance)
endif() endif()
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_int8 gemm_int8.cpp) add_test_executable(test_gemm_int8 gemm_int8.cpp)
target_link_libraries(test_gemm_int8 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_int8 PRIVATE utility device_gemm_instance)
endif() endif()
\ No newline at end of file
...@@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_custom_target(test_gemm_layernorm) add_custom_target(test_gemm_layernorm)
add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp) add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance) if(result EQUAL 0)
add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16) target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
set(target 1) add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
endif() set(target 1)
endif()
endif() endif()
endforeach() endforeach()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp)
add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp) if(result EQUAL 0)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility) target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility device_gemm_reduce_instance)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance)
endif() endif()
\ No newline at end of file
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
TYPED_TEST(TestGemmSplitK_MK_KN, SmallM) TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM) ...@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
TYPED_TEST(TestGemmSplitK_MK_NK, SmallM) TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM) ...@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
TYPED_TEST(TestGemmSplitK_KM_KN, SmallM) TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM) ...@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
TYPED_TEST(TestGemmSplitK_KM_NK, SmallM) TYPED_TEST(TestGemmSplitK_KM_NK, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
......
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940") list(APPEND gpu_list_xdl gfx908 gfx90a gfx940)
add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp) list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) set(target 0)
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface.cpp) foreach(gpu IN LISTS GPU_TARGETS)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)
endif() add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp)
\ No newline at end of file target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_xdl.cpp)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
set(target 1)
endif()
if(gpu IN_LIST gpu_list_wmma AND target EQUAL 0)
add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_wmma.cpp)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
set(target 1)
endif()
endforeach()
\ No newline at end of file
...@@ -51,16 +51,20 @@ using namespace ck::tensor_layout::convolution; ...@@ -51,16 +51,20 @@ using namespace ck::tensor_layout::convolution;
using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>, using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>, std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>, std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
std::tuple<int8_t, GNHWK, GKYXC, GNHWC>,
std::tuple<float, NHWGK, GKYXC, NHWGC>, std::tuple<float, NHWGK, GKYXC, NHWGC>,
std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>, std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>; std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>,
std::tuple<int8_t, NHWGK, GKYXC, NHWGC>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>, using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>, std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>, std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<int8_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<float, NDHWGK, GKZYXC, NDHWGC>, std::tuple<float, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>, std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>>; std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<int8_t, NDHWGK, GKZYXC, NDHWGC>>;
template <typename Tuple> template <typename Tuple>
class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData<Tuple> class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData<Tuple>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
// Tensor element type exercised by this interface test (half precision).
using DataType = ck::half_t;
// Accumulator type used inside the device instance.
using AccDataType = float;
// Identity element-wise operation applied to A, B and CDE.
using Pass = ck::tensor_operation::element_wise::PassThrough;
// Shorthand for a compile-time integer sequence.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using ConvBackwardDataSpecialization =
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization;
// The two specializations under test: the generic path and the
// 1x1-filter / unit-stride / zero-pad fast path.
static constexpr auto ConvBwdDataDefault = ConvBackwardDataSpecialization::Default;
static constexpr auto Filter1x1Stride1Pad0 = ConvBackwardDataSpecialization::Filter1x1Stride1Pad0;
// Fixture that checks whether the WMMA grouped conv bwd-data device instance
// accepts a given problem description for one (Out, Wei, In) layout tuple and
// one convolution specialization. No kernel is launched: Run() only builds an
// argument from `conv_param` and queries IsSupportedArgument().
template <typename Tuple, ConvBackwardDataSpecialization ConvSpec>
class TestGroupedConvndBwdData : public ::testing::Test
{
protected:
// Every case in this file is a 2D convolution.
static constexpr ck::index_t NDimSpatial = 2;
// Tuple element order is fixed: <0> output, <1> weight, <2> input layout.
using OutLayout = std::tuple_element_t<0, Tuple>;
using WeiLayout = std::tuple_element_t<1, Tuple>;
using InLayout = std::tuple_element_t<2, Tuple>;
// The single device instance whose support surface is probed. The numeric
// columns are kernel tuning parameters; keep them aligned with the header
// comment rows below — do not reorder.
// clang-format off
using GroupedConvBwdDataDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
//| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial,OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType, ck::Tuple<>, DataType, Pass, Pass, Pass, ConvSpec, 64, 32, 64, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>;
// clang-format on
// Problem description; each TYPED_TEST assigns this before calling Run().
ck::utils::conv::ConvParam conv_param;
// Builds a pointer-free argument from conv_param and reports whether the
// device instance accepts it. NOTE(review): this template parameter shadows
// the class-level NDimSpatial constant; both are 2 for every caller here.
template <ck::index_t NDimSpatial>
bool Run()
{
// Host tensor descriptors derived from the convolution problem.
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
conv_param);
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
// Flatten descriptor lengths/strides into the fixed-size arrays the
// device API expects (G + N + C plus NDimSpatial spatial dims).
std::array<ck::index_t, NDimSpatial + 3> out_lengths{};
std::array<ck::index_t, NDimSpatial + 3> out_strides{};
std::array<ck::index_t, NDimSpatial + 3> wei_lengths{};
std::array<ck::index_t, NDimSpatial + 3> wei_strides{};
std::array<ck::index_t, NDimSpatial + 3> in_lengths{};
std::array<ck::index_t, NDimSpatial + 3> in_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(out_g_n_k_wos_desc.GetLengths(), out_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), out_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), wei_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), wei_strides);
copy(in_g_n_c_wis_desc.GetLengths(), in_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), in_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// Null data pointers are deliberate: the argument is only handed to
// IsSupportedArgument() below and never launched.
auto conv = GroupedConvBwdDataDeviceInstance{};
auto argument = conv.MakeArgument(nullptr,
nullptr,
std::array<const void*, 0>{},
nullptr,
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
in_lengths,
in_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
Pass{},
Pass{},
Pass{});
return conv.IsSupportedArgument(argument);
}
};
// Convolution tensor layouts used to build the test type list.
using GNHWC = ck::tensor_layout::convolution::GNHWC;
using NHWGC = ck::tensor_layout::convolution::NHWGC;
using GKYXC = ck::tensor_layout::convolution::GKYXC;
using GNHWK = ck::tensor_layout::convolution::GNHWK;
using NHWGK = ck::tensor_layout::convolution::NHWGK;
// Each tuple is (OutLayout, WeiLayout, InLayout), matching the
// tuple_element indices consumed by the fixture.
using KernelTypes =
::testing::Types<std::tuple<GNHWK, GKYXC, GNHWC>, std::tuple<NHWGK, GKYXC, NHWGC>>;
// Fixture bound to the generic (Default) specialization.
template <typename Tuple>
class TestGroupedConvndBwdDataDefault : public TestGroupedConvndBwdData<Tuple, ConvBwdDataDefault>
{
};
// Fixture bound to the 1x1-filter / unit-stride / zero-pad specialization.
template <typename Tuple>
class TestGroupedConvndBwdDataFilter1x1
: public TestGroupedConvndBwdData<Tuple, Filter1x1Stride1Pad0>
{
};
// Instantiate both fixtures over every layout combination.
TYPED_TEST_SUITE(TestGroupedConvndBwdDataDefault, KernelTypes);
TYPED_TEST_SUITE(TestGroupedConvndBwdDataFilter1x1, KernelTypes);
// Verifies that the Filter1x1Stride1Pad0 instance rejects every problem that
// violates the specialization, and accepts one that satisfies it.
TYPED_TEST(TestGroupedConvndBwdDataFilter1x1, SpecializationCheck)
{
    // All cases in this suite probe the 2D path.
    const auto probe = [this] { return this->template Run<2>(); };

    // A 3x3 filter breaks the 1x1 requirement.
    this->conv_param = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
    bool is_supported = probe();
    EXPECT_FALSE(is_supported);

    // Strides of 2 break the unit-stride requirement.
    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
    is_supported = probe();
    EXPECT_FALSE(is_supported);

    // Non-zero padding breaks the zero-pad requirement.
    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
    is_supported = probe();
    EXPECT_FALSE(is_supported);

    // 1x1 filter, unit stride, no padding: must be accepted.
    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
    is_supported = probe();
    EXPECT_TRUE(is_supported);
}
// Verifies that channel counts which break the configured vector-load widths
// are rejected by the Default-specialization instance.
TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck)
{
    const auto probe = [this] { return this->template Run<2>(); };

    // K=129 breaks the A-tensor vector load.
    this->conv_param = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
    bool is_supported = probe();
    EXPECT_FALSE(is_supported);

    // C=257 breaks the vector load for B, E and Ds.
    this->conv_param = {2, 2, 128, 128, 257, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
    is_supported = probe();
    EXPECT_FALSE(is_supported);
}
add_gtest_executable(test_grouped_convnd_fwd grouped_convnd_fwd.cpp) add_gtest_executable(test_grouped_convnd_fwd test_grouped_convnd_fwd.cpp)
target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance) target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment