Commit 316d4acc authored by Adam Osewski

Merge remote-tracking branch 'origin/develop' into aosewski/gemm_tile_loop

parents 9836e0ae 37a8c1f7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include "profiler/profile_image_to_column_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace {
enum struct ConvLayout
{
NHWC, // 0
};
enum struct DataType
{
F32_F32, // 0
F16_F16, // 1
BF16_BF16, // 2
INT8_INT8, // 3
};
#define OP_NAME "image_to_column"
#define OP_DESC "Image To Column"
static void print_helper_msg()
{
std::cout
// clang-format off
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n"
<< " 3: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
// clang-format on
}
} // namespace
int profile_image_to_column(int argc, char* argv[])
{
// 8 for control, 1 for num_dim_spatial
if(argc < 9)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
{
print_helper_msg();
return 1;
}
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
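// Hypothetical 2D example (values made up; the exact order of the six
// per-dimension groups is defined by get_conv_param_parser_helper_msg()):
//   ckProfiler image_to_column 0 0 1 1 0 1 2  1 4 1 192  3 3  28 28  1 1  1 1  1 1  1 1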
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using INT8 = int8_t;
using namespace ck::tensor_layout::convolution;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
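// ck::Number<> encodes the spatial-dim count as a distinct type, so passing
// I1/I2/I3 into the generic lambda below selects the template instantiation
// at compile time from a runtime switch.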
auto profile = [&](auto num_dim_spatial_tmp, auto in_layout, auto in_type, auto out_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using InDataType = decltype(in_type);
using OutDataType = decltype(out_type);
bool pass = ck::profiler::
profile_image_to_column_impl<NDimSpatial, InLayout, InDataType, OutDataType>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
};
// NHWC
if(layout == ConvLayout::NHWC)
{
if(num_dim_spatial == 1)
{
if(data_type == DataType::F32_F32)
{
return profile(I1, GNWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I1, GNWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I1, GNWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I1, GNWC{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 2)
{
if(data_type == DataType::F32_F32)
{
return profile(I2, GNHWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I2, GNHWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I2, GNHWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I2, GNHWC{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 3)
{
if(data_type == DataType::F32_F32)
{
return profile(I3, GNDHWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I3, GNDHWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I3, GNDHWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I3, GNDHWC{}, INT8{}, INT8{});
}
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_image_to_column);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <string>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_max_pool3d_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
struct maxPoolbwdArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
{"wsize", {}},
{"wstride", {}},
{"wdilation", {}},
{"pad1", {}},
{"pad2", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
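// Note: parse_opt collects the integers that follow "--<key>" up to the next
// token beginning with '-'; negative option values would end the list early.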
void print_help_max_pool3d_bwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
<< "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
<< "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
<< "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
<< "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
<< "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
<< "eg: ckProfiler max_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<< std::endl;
}
int profile_max_pool3d_bwd(int argc, char* argv[])
{
ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
bool do_verification = true;
int init_method = 0;
bool do_log = false;
bool time_kernel = true;
std::vector<index_t> in_length = {2, 32, 30, 30, 30};
std::vector<index_t> wsize = {2, 2, 2};
std::vector<index_t> wstride = {2, 2, 2};
std::vector<index_t> wdilation = {1, 1, 1};
std::vector<index_t> pad1 = {1, 1, 1};
std::vector<index_t> pad2 = {1, 1, 1};
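// The defaults above mirror the example invocation printed by
// print_help_max_pool3d_bwd().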
if(argc != 2 && argc != 33)
{
print_help_max_pool3d_bwd();
return 0;
}
else if(argc == 33)
{
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
do_verification = std::stoi(argv[3]);
init_method = std::stoi(argv[4]);
do_log = std::stoi(argv[5]);
time_kernel = std::stoi(argv[6]);
// parse the long options
maxPoolbwdArgParser arg_parser;
arg_parser(argc, argv);
in_length = arg_parser.long_opts["length"];
wsize = arg_parser.long_opts["wsize"];
wstride = arg_parser.long_opts["wstride"];
wdilation = arg_parser.long_opts["wdilation"];
pad1 = arg_parser.long_opts["pad1"];
pad2 = arg_parser.long_opts["pad2"];
}
#ifdef CK_ENABLE_FP16
using F16 = ck::half_t;
#endif
#ifdef CK_ENABLE_BF16
using BF16 = ck::bhalf_t;
#endif
#ifdef CK_ENABLE_FP32
using F32 = float;
#endif
using I32 = int32_t;
if(false)
;
#ifdef CK_ENABLE_FP16
else if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_max_pool3d_bwd_impl<F16, F16, I32, F16, F16, false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_BF16
else if(data_type == ck::DataTypeEnum::BFloat16)
{
ck::profiler::profile_max_pool3d_bwd_impl<BF16, BF16, I32, BF16, BF16, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_FP32
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_max_pool3d_bwd_impl<F32, F32, I32, F32, F32, false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
else
{
throw std::runtime_error("not implemented yet");
}
return 0;
}
REGISTER_PROFILER_OPERATION("max_pool3d_bwd", "max_pool3d bwd", profile_max_pool3d_bwd);
@@ -51,7 +51,7 @@ struct maxPoolFwdArgParser
 void print_help_max_pool3d_fwd()
 {
-std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
+std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
 << "arg2: verification (0: no; 1: yes)\n"
 << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
 << "arg4: print tensor value (0: no; 1: yes)\n"
@@ -109,8 +109,15 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
 pad2 = arg_parser.long_opts["pad2"];
 }
-using F16 = ck::half_t;
-using F32 = float;
+#ifdef CK_ENABLE_FP16
+using F16 = ck::half_t;
+#endif
+#ifdef CK_ENABLE_BF16
+using BF16 = ck::bhalf_t;
+#endif
+#ifdef CK_ENABLE_FP32
+using F32 = float;
+#endif
 using I32 = int32_t;
 using NDHWC = ck::tensor_layout::convolution::NDHWC;
@@ -120,7 +127,10 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
 constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
 #endif
-if(data_type == ck::DataTypeEnum::Half)
+if(false)
+;
+#ifdef CK_ENABLE_FP16
+else if(data_type == ck::DataTypeEnum::Half)
 {
 if(return_index)
 ck::profiler::
@@ -149,6 +159,51 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
 pad1,
 pad2);
 }
+#endif
+#ifdef CK_ENABLE_BF16
+else if(data_type == ck::DataTypeEnum::BFloat16)
+{
+if(return_index)
+ck::profiler::profile_pool3d_fwd_impl<BF16,
+BF16,
+BF16,
+I32,
+NDHWC,
+NDHWC,
+ReduceOpId,
+false,
+true>(do_verification,
+init_method,
+do_log,
+time_kernel,
+in_length,
+wsize,
+wstride,
+wdilation,
+pad1,
+pad2);
+else
+ck::profiler::profile_pool3d_fwd_impl<BF16,
+BF16,
+BF16,
+I32,
+NDHWC,
+NDHWC,
+ReduceOpId,
+false,
+false>(do_verification,
+init_method,
+do_log,
+time_kernel,
+in_length,
+wsize,
+wstride,
+wdilation,
+pad1,
+pad2);
+}
+#endif
+#ifdef CK_ENABLE_FP32
 else if(data_type == ck::DataTypeEnum::Float)
 {
 if(return_index)
@@ -178,6 +233,7 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
 pad1,
 pad2);
 }
+#endif
 else
 {
 throw std::runtime_error("not implemented yet");
...
-#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
-git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
@@ -16,4 +16,3 @@ cmake
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D USE_BITINT_EXTENSION_INT4=OFF \
 ${MY_PROJECT_SOURCE}
-
@@ -11,7 +11,7 @@ run_and_check() {
 }
 echo "I: Installing tools required for pre-commit checks..."
-run_and_check apt install clang-format-10
+run_and_check apt install clang-format-12
 echo "I: Installing pre-commit itself..."
 run_and_check pip3 install pre-commit
...
@@ -57,9 +57,10 @@ add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
 add_subdirectory(contraction)
-add_subdirectory(pool_fwd)
+add_subdirectory(pool)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
+add_subdirectory(image_to_column)
 if(GPU_TARGETS MATCHES "gfx11")
 add_subdirectory(wmma_op)
 endif()
@@ -71,6 +71,6 @@ TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes);
 #ifdef __fp16
 TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
 #endif
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
 TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
 #endif
add_gtest_executable(test_image_to_column test_image_to_column.cpp)
target_link_libraries(test_image_to_column PRIVATE utility device_image_to_column_instance)
add_gtest_executable(test_image_to_column_interface test_image_to_column_interface.cpp)
target_link_libraries(test_image_to_column_interface PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_image_to_column_impl.hpp"
template <typename Tuple>
class TestImageToColumn : public ::testing::Test
{
protected:
using InDataType = std::tuple_element_t<0, Tuple>;
using OutDataType = std::tuple_element_t<1, Tuple>;
using InLayout = std::tuple_element_t<2, Tuple>;
std::vector<ck::utils::conv::ConvParam> conv_params;
template <ck::index_t NDimSpatial>
void Run()
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
{
pass = pass && ck::profiler::profile_image_to_column_impl<NDimSpatial,
InLayout,
InDataType,
OutDataType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
}
EXPECT_TRUE(pass);
}
};
using namespace ck::tensor_layout::convolution;
using KernelTypes1d = ::testing::Types<std::tuple<float, float, GNWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNWC>,
std::tuple<ck::half_t, ck::half_t, GNWC>,
std::tuple<int8_t, int8_t, GNWC>>;
using KernelTypes2d = ::testing::Types<std::tuple<float, float, GNHWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNHWC>,
std::tuple<ck::half_t, ck::half_t, GNHWC>,
std::tuple<int8_t, int8_t, GNHWC>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, float, GNDHWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNDHWC>,
std::tuple<ck::half_t, ck::half_t, GNDHWC>,
std::tuple<int8_t, int8_t, GNDHWC>>;
template <typename Tuple>
class TestImageToColumn1d : public TestImageToColumn<Tuple>
{
};
template <typename Tuple>
class TestImageToColumn2d : public TestImageToColumn<Tuple>
{
};
template <typename Tuple>
class TestImageToColumn3d : public TestImageToColumn<Tuple>
{
};
TYPED_TEST_SUITE(TestImageToColumn1d, KernelTypes1d);
TYPED_TEST_SUITE(TestImageToColumn2d, KernelTypes2d);
TYPED_TEST_SUITE(TestImageToColumn3d, KernelTypes3d);
TYPED_TEST(TestImageToColumn1d, Test1D)
{
this->conv_params.clear();
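// ConvParam is assumed to take: NDim, G, N, K, C, filter spatial lengths,
// image spatial lengths, strides, dilations, left pads, right pads.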
this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {2}, {1}, {0}, {0}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
// ScalarPerVector should be 1
this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
// stride != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
// dilation != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
this->template Run<1>();
}
TYPED_TEST(TestImageToColumn2d, Test2D)
{
this->conv_params.clear();
this->conv_params.push_back(
{2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->template Run<2>();
}
TYPED_TEST(TestImageToColumn3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->template Run<3>();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
using DataType = float;
using InLayout = ck::tensor_layout::convolution::GNWC;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
template <ck::index_t ScalarPerVector, bool IsCPacked>
class TestImageToColumnInterface : public ::testing::Test
{
protected:
static constexpr ck::index_t NDimSpatial = 1;
// clang-format off
using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
< NDimSpatial, InLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>;
// clang-format on
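// The tests below probe IsSupportedArgument(): vector loads happen along the
// contiguous axis (C, or C * X when input and output are packed), so that
// length is expected to be divisible by ScalarPerVector. This reading is
// inferred from the test expectations rather than stated by the device op.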
ck::utils::conv::ConvParam conv_param;
bool Run()
{
const auto N = conv_param.N_;
const auto C = conv_param.C_;
const auto FakeC =
conv_param.C_ / 2; // Fake C to simulate the behavior that C is not packed
const ck::index_t NDoHoWo =
N *
ck::accumulate_n<ck::index_t>(
conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const ck::index_t CZYX =
C *
ck::accumulate_n<ck::index_t>(
conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const auto in_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
std::array<ck::index_t, 2> output_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
copy(out_desc.GetStrides(), output_m_k_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
auto img2col = DeviceImgToColInstance{};
auto argument = img2col.MakeArgument(nullptr,
nullptr,
N,
IsCPacked ? C : FakeC,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
return img2col.IsSupportedArgument(argument);
}
};
class TestImageToColumnInterface1ScalarPerVector : public TestImageToColumnInterface<1, true>
{
};
class TestImageToColumnInterface4ScalarPerVector : public TestImageToColumnInterface<4, true>
{
};
class TestImageToColumnInterface4ScalarPerVectorFakeC : public TestImageToColumnInterface<4, false>
{
};
TEST_F(TestImageToColumnInterface1ScalarPerVector, X1ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
TEST_F(TestImageToColumnInterface4ScalarPerVector, X4ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
TEST_F(TestImageToColumnInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
{
// C = 3
this->conv_param = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_FALSE(is_supported);
// C = 8 (FakeC = 4, divisible by ScalarPerVector)
this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
-add_custom_target(test_pool_fwd)
+add_custom_target(test_pool)
+add_gtest_executable(test_avg_pool3d_bwd test_avg_pool3d_bwd.cpp)
+add_gtest_executable(test_max_pool3d_bwd test_max_pool3d_bwd.cpp)
 add_gtest_executable(test_avg_pool3d_fwd test_avg_pool3d_fwd.cpp)
 add_gtest_executable(test_max_pool3d_fwd test_max_pool3d_fwd.cpp)
+target_link_libraries(test_avg_pool3d_bwd PRIVATE utility device_avg_pool3d_bwd_instance)
+target_link_libraries(test_max_pool3d_bwd PRIVATE utility device_max_pool_bwd_instance)
 target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
 target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
-add_dependencies(test_pool_fwd test_avg_pool3d_fwd)
-add_dependencies(test_pool_fwd test_max_pool3d_fwd)
+add_dependencies(test_pool test_avg_pool3d_bwd)
+add_dependencies(test_pool test_max_pool3d_bwd)
+add_dependencies(test_pool test_avg_pool3d_fwd)
+add_dependencies(test_pool test_max_pool3d_fwd)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_avg_pool3d_bwd_impl.hpp"
#include "test_pool_fwd_common.hpp"
template <typename Tuple>
class TestAvgPool3dBwd : public ::testing::Test
{
protected:
using DOutDataType = std::tuple_element_t<0, Tuple>;
using DInDataType = std::tuple_element_t<1, Tuple>;
using ComputeDataType = std::tuple_element_t<2, Tuple>;
using DOutLayout = std::tuple_element_t<3, Tuple>;
using DInLayout = std::tuple_element_t<4, Tuple>;
std::vector<PoolingParam> params;
void Run()
{
for(auto param : params)
{
bool success =
ck::profiler::profile_avg_pool3d_bwd_impl<DOutDataType,
DInDataType,
ComputeDataType,
DOutLayout,
DInLayout>(true,
2,
false,
false,
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
EXPECT_TRUE(success);
}
}
};
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, F32, NDHWC, NDHWC>,
std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, F32, NDHWC, NDHWC>,
std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, F32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
#endif
TYPED_TEST_SUITE(TestAvgPool3dBwd, KernelTypes);
TYPED_TEST(TestAvgPool3dBwd, Test_Pool)
{
// length, window_length, window_stride, window_dilation, left_pad, right_pad
this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
{{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
{{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
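// Standard pooling output size: floor((L + pad_l + pad_r - dilation*(k-1) - 1)/stride) + 1;
// e.g. the last case gives (30 + 2 - 1*(2-1) - 1)/2 + 1 = 16 per spatial dim.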
this->Run();
}
@@ -43,7 +43,7 @@ class TestAvgPool3dFwd : public ::testing::Test
 }
 }
 };
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
 using KernelTypes =
 ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
 #else
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_max_pool3d_bwd_impl.hpp"
#include "test_pool_fwd_common.hpp"
template <typename Tuple>
class TestMaxPool3dBwd : public ::testing::Test
{
protected:
using DOutDataType = std::tuple_element_t<0, Tuple>;
using DInDataType = std::tuple_element_t<1, Tuple>;
using IndexDataType = std::tuple_element_t<2, Tuple>;
using InDataType = DInDataType;
using OutDataType = DOutDataType;
std::vector<PoolingParam> params;
void Run()
{
for(auto param : params)
{
bool success =
ck::profiler::profile_max_pool3d_bwd_impl<InDataType,
OutDataType,
IndexDataType,
DOutDataType,
DInDataType,
false>(true,
2,
false,
false,
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
EXPECT_TRUE(success);
}
}
};
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, I32, NDHWC, NDHWC>,
std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, I32, NDHWC, NDHWC>,
std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
std::tuple<BF16, BF16, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP16)
using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_BF16)
using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, I32, NDHWC, NDHWC>>;
#elif defined(CK_ENABLE_FP32)
using KernelTypes = ::testing::Types<std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
#endif
TYPED_TEST_SUITE(TestMaxPool3dBwd, KernelTypes);
TYPED_TEST(TestMaxPool3dBwd, Test_Pool)
{
// length, window_length, window_stride, window_dilation, left_pad, right_pad
this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
{{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
{{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
this->Run();
}
@@ -66,7 +66,7 @@ class TestMaxPool3dFwd : public ::testing::Test
 }
 };
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
 using KernelTypes =
 ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
 #else
...
@@ -4,10 +4,12 @@
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"
 using F16 = ck::half_t;
+using BF16 = ck::bhalf_t;
 using F32 = float;
 using I32 = int32_t;
 using ck::index_t;
+using NDHWC = ck::tensor_layout::convolution::NDHWC;
 struct PoolingParam
 {
...
@@ -10,7 +10,7 @@
 template <ck::index_t N>
 using I = ck::Number<N>;
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
 using F16 = ck::half_t;
 #endif
 using F32 = float;
@@ -23,7 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
 std::tuple< F16, F32, F16, I<3>>,
 #endif
 std::tuple< F32, F32, F32, I<3>>
...
@@ -10,7 +10,7 @@
 template <ck::index_t N>
 using I = ck::Number<N>;
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
 using F16 = ck::half_t;
 #endif
 using F32 = float;
@@ -23,7 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
 std::tuple< F16, F32, F16, I<4>>,
 #endif
 std::tuple< F32, F32, F32, I<4>>
...