Unverified Commit a793afc9 authored by aledudek, committed by GitHub

Extend pool3d fwd avg, max operations by f8_t, int8_t types (#1483)



* Extend pool3d fwd avg, max operations by f8_t, int8_t types

* Pack MaxPool3dFwd params together

* Fix MaxPool3dFwd AVG instances

* Decrease verification precision for bf16

* Adjust tests + review changes

* Adjust threshold for F8

* Adjusted compute types for MAX op instances

* Fix ComputeDataType mismatch in tests and profiler for AVG

* Fix naming from max_pool3d_fwd to pool3d_fwd

* Adjust CMakeLists

---------
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
parent 8ec15e64
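With these instances in place, the new paths can be exercised through the renamed profiler entry. An illustrative fp8 average-pool run (data-type id 7 and reduce-op id 1, as documented in the updated help text further below) would look like:

ckProfiler pool3d_fwd 7 1 2 0 1 0 1 --length 2 32 30 30 30 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1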
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -22,7 +22,7 @@ static constexpr auto WindowRank = 3;
static constexpr auto MaxOp = ck::ReduceTensorOp::MAX;
static constexpr auto AvgOp = ck::ReduceTensorOp::AVG;
#ifdef CK_ENABLE_FP16
// FP16
void add_device_pool3d_fwd_ndhwc_f16_instances(
std::vector<std::unique_ptr<
......@@ -36,8 +36,22 @@ void add_device_pool3d_fwd_ndhwc_f16_instances(
void add_device_pool3d_fwd_ndhwc_index_f16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, NDHWC, NDHWC, MaxOp, true>>>&);
#endif
#ifdef CK_ENABLE_BF16
using F8 = ck::f8_t;
// F8
void add_device_pool3d_fwd_ndhwc_f8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, MaxOp, false>>>&);
void add_device_pool3d_fwd_ndhwc_f8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, AvgOp, false>>>&);
// FP8 - return index
void add_device_pool3d_fwd_ndhwc_index_f8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, MaxOp, true>>>&);
// BF16
void add_device_pool3d_fwd_ndhwc_bf16_instances(
std::vector<std::unique_ptr<
......@@ -51,8 +65,7 @@ void add_device_pool3d_fwd_ndhwc_bf16_instances(
void add_device_pool3d_fwd_ndhwc_index_bf16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, BF16, BF16, I32, NDHWC, NDHWC, MaxOp, true>>>&);
#endif
#ifdef CK_ENABLE_FP32
// FP32
void add_device_pool3d_fwd_ndhwc_f32_instances(
std::vector<std::unique_ptr<
......@@ -66,7 +79,21 @@ void add_device_pool3d_fwd_ndhwc_f32_instances(
void add_device_pool3d_fwd_ndhwc_index_f32_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NDHWC, NDHWC, MaxOp, true>>>&);
#endif
// I8
void add_device_pool3d_fwd_ndhwc_i8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, MaxOp, false>>>&);
void add_device_pool3d_fwd_ndhwc_i8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, AvgOp, false>>>&);
// I8 - return index
void add_device_pool3d_fwd_ndhwc_index_i8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, MaxOp, true>>>&);
template <typename InDataType,
typename OutDataType,
typename IndexDataType,
......@@ -99,7 +126,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InLayout, NDHWC> && is_same_v<OutLayout, NDHWC>)
{
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<InDataType, F16> && is_same_v<OutDataType, F16> &&
is_same_v<IndexDataType, I32>)
{
......@@ -112,8 +138,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
add_device_pool3d_fwd_ndhwc_f16_instances(op_ptrs);
}
}
#endif
#ifdef CK_ENABLE_BF16
else if constexpr(is_same_v<InDataType, BF16> && is_same_v<OutDataType, BF16> &&
is_same_v<IndexDataType, I32>)
{
......@@ -126,8 +150,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
add_device_pool3d_fwd_ndhwc_bf16_instances(op_ptrs);
}
}
#endif
#ifdef CK_ENABLE_FP32
else if constexpr(is_same_v<InDataType, F32> && is_same_v<OutDataType, F32> &&
is_same_v<IndexDataType, I32>)
{
......@@ -140,7 +162,30 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
add_device_pool3d_fwd_ndhwc_f32_instances(op_ptrs);
}
}
#endif
else if constexpr(is_same_v<InDataType, F8> && is_same_v<OutDataType, F8> &&
is_same_v<IndexDataType, I32>)
{
if constexpr(OutputIndex && ReduceOpId == MaxOp)
{
add_device_pool3d_fwd_ndhwc_index_f8_instances(op_ptrs);
}
else
{
add_device_pool3d_fwd_ndhwc_f8_instances(op_ptrs);
}
}
else if constexpr(is_same_v<InDataType, I8> && is_same_v<OutDataType, I8> &&
is_same_v<IndexDataType, I32>)
{
if constexpr(OutputIndex && ReduceOpId == MaxOp)
{
add_device_pool3d_fwd_ndhwc_index_i8_instances(op_ptrs);
}
else
{
add_device_pool3d_fwd_ndhwc_i8_instances(op_ptrs);
}
}
}
return op_ptrs;
......
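For context, a minimal caller-side sketch of how the new F8 average-pool instances could be enumerated through the factory specialization above. The static GetInstances() entry point and the fully spelled-out aliases are assumptions based on the usual Composable Kernel factory pattern; they are truncated out of this excerpt.

// Hypothetical usage sketch (not part of this commit).
using F8    = ck::f8_t;
using I32   = int32_t;
using NDHWC = ck::tensor_layout::convolution::NDHWC;

// DevicePoolFwd<InOutRank, WindowRank, In, Out, Index, InLayout, OutLayout, ReduceOp, OutputIndex>
using DeviceOp = ck::tensor_operation::device::
    DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ck::ReduceTensorOp::AVG, false>;

// Assumed factory entry point; returns one pointer per tuned instance.
const auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();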
set(DEVICE_POOL3D_FWD_INSTANCES)
list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
device_max_pool3d_fwd_ndhwc_f16_instance.cpp
device_max_pool3d_fwd_ndhwc_f8_instance.cpp
device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
device_max_pool3d_fwd_ndhwc_i8_instance.cpp
device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
device_max_pool3d_fwd_ndhwc_f32_instance.cpp
device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
void add_device_pool3d_fwd_ndhwc_f8_instances(
std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
void add_device_pool3d_fwd_ndhwc_i8_instances(
std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I32, ReduceOpId, false>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
void add_device_pool3d_fwd_ndhwc_f8_instances(
std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, false>{});
}
void add_device_pool3d_fwd_ndhwc_index_f8_instances(
std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
instances)
{
add_device_operation_instances(
instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, true>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
void add_device_pool3d_fwd_ndhwc_i8_instances(
std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, false>{});
}
void add_device_pool3d_fwd_ndhwc_index_i8_instances(
std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
instances)
{
add_device_operation_instances(
instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, true>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -15,6 +15,8 @@ namespace tensor_operation {
namespace device {
namespace instance {
using I8 = int8_t;
using F8 = ck::f8_t;
using I32 = int32_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -17,6 +17,26 @@
namespace ck {
namespace profiler {
struct PoolFwdInputParams
{
int do_verification;
int init_method;
bool do_log;
bool time_kernel;
bool return_index;
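// 0: max, 1: avg (mirrors the profiler's reduce-op argument)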
int reduce_op;
};
struct PoolFwdKernelParams
{
std::vector<index_t> in_length; // NCDHW
std::vector<index_t> window_spatial_lengths;
std::vector<index_t> window_strides;
std::vector<index_t> window_dilations;
std::vector<index_t> input_left_pads;
std::vector<index_t> input_right_pads;
};
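// Minimal call sketch (mirrors the updated profiler and gtest call sites; the
// F16/I32/NDHWC aliases are assumed to be visible at the call site):
//   PoolFwdInputParams  in_params{/*do_verification=*/1, /*init_method=*/2,
//                                 /*do_log=*/false, /*time_kernel=*/false,
//                                 /*return_index=*/false, /*reduce_op=*/0};
//   PoolFwdKernelParams kernel_params{
//       {2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
//   bool pass = profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC,
//                                       ck::ReduceTensorOp::MAX, false, false>(
//       in_params, kernel_params);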
template <typename InDataType,
typename OutDataType,
typename ComputeDataType,
......@@ -26,29 +46,23 @@ template <typename InDataType,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool profile_pool3d_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> in_length, // NCDHW
std::vector<index_t> window_spatial_lengths,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
{
constexpr index_t InOutRank = 5;
constexpr index_t WindowRank = 3;
if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
if(kernel_params.in_length.size() != InOutRank ||
kernel_params.window_spatial_lengths.size() != WindowRank ||
kernel_params.window_strides.size() != WindowRank ||
kernel_params.window_dilations.size() != WindowRank ||
kernel_params.input_left_pads.size() != WindowRank ||
kernel_params.input_right_pads.size() != WindowRank)
return false;
std::vector<index_t> out_length(InOutRank);
int N = in_length[0];
int C = in_length[1];
int N = kernel_params.in_length[0];
int C = kernel_params.in_length[1];
out_length[0] = N;
out_length[1] = C;
......@@ -56,18 +70,18 @@ bool profile_pool3d_fwd_impl(int do_verification,
// Calculate Do, Ho, Wo
for(int i = 2; i < InOutRank; ++i)
{
auto pad1 = input_left_pads[i - 2];
auto pad2 = input_right_pads[i - 2];
auto windows_size = window_spatial_lengths[i - 2];
auto windows_stride = window_strides[i - 2];
auto windows_dilation = window_dilations[i - 2];
auto pad1 = kernel_params.input_left_pads[i - 2];
auto pad2 = kernel_params.input_right_pads[i - 2];
auto windows_size = kernel_params.window_spatial_lengths[i - 2];
auto windows_stride = kernel_params.window_strides[i - 2];
auto windows_dilation = kernel_params.window_dilations[i - 2];
auto eff = (windows_size - 1) * windows_dilation + 1;
out_length[i] = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
out_length[i] = (kernel_params.in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
}
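// Worked example with the profiler defaults: in = 30, pad1 = pad2 = 1, window
// size = 2, stride = 2, dilation = 1 -> eff = (2 - 1) * 1 + 1 = 2 and
// out = (30 + 1 + 1 - 2) / 2 + 1 = 16 for each of Do, Ho and Wo.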
int Di = in_length[2];
int Hi = in_length[3];
int Wi = in_length[4];
int Di = kernel_params.in_length[2];
int Hi = kernel_params.in_length[3];
int Wi = kernel_params.in_length[4];
int Do = out_length[2];
int Ho = out_length[3];
int Wo = out_length[4];
......@@ -88,7 +102,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
f_host_tensor_descriptor(N, C, Do, Ho, Wo));
switch(init_method)
switch(in_params.init_method)
{
case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
......@@ -125,7 +139,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
if(in_params.do_verification)
{
using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
WindowRank,
......@@ -141,11 +155,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
auto ref_argument = ref.MakeArgument(in_n_c_di_hi_wi,
out_n_c_do_ho_wo_host,
out_indices_n_c_do_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
kernel_params.window_spatial_lengths,
kernel_params.window_strides,
kernel_params.window_dilations,
kernel_params.input_left_pads,
kernel_params.input_right_pads);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
......@@ -158,16 +172,16 @@ bool profile_pool3d_fwd_impl(int do_verification,
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
in_length,
window_spatial_lengths,
kernel_params.in_length,
kernel_params.window_spatial_lengths,
out_length,
{Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
{Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
{Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
kernel_params.window_strides,
kernel_params.window_dilations,
kernel_params.input_left_pads,
kernel_params.input_right_pads,
{2, 3, 4});
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
......@@ -176,10 +190,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
}
else
{
if(time_kernel)
if(in_params.time_kernel)
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
LogRange(std::cout << "input lengths = ", kernel_params.in_length, ", ")
<< std::endl;
}
continue;
......@@ -187,7 +202,8 @@ bool profile_pool3d_fwd_impl(int do_verification,
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, in_params.time_kernel});
std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
......@@ -198,7 +214,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
if(in_params.time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
......@@ -209,25 +225,25 @@ bool profile_pool3d_fwd_impl(int do_verification,
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
if(in_params.do_verification)
{
out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
auto tolerance = 1e-3;
bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
out_n_c_do_ho_wo_host.mData,
"Error: Incorrect results",
1e-3,
1e-3);
tolerance,
tolerance);
if constexpr(OutputIndex)
{
out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
out_indices_n_c_do_ho_wo_host);
}
if(do_log)
if(in_params.do_log)
{
LogRangeAsType<float>(
std::cout << "in_n_c_di_hi_wi : ", in_n_c_di_hi_wi.mData, ",")
......@@ -249,20 +265,21 @@ bool profile_pool3d_fwd_impl(int do_verification,
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
LogRange(std::cout << "lengths = [", kernel_params.in_length, ", ")
<< "]." << std::endl;
return false;
}
else
{
if(time_kernel)
if(in_params.time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
if(in_params.time_kernel)
{
LogRange(std::cout << "length = ", in_length, ",") << std::endl;
LogRange(std::cout << "length = ", kernel_params.in_length, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
......
......@@ -10,7 +10,7 @@ set(PROFILER_SOURCES
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
profile_max_pool2d_fwd.cpp
profile_max_pool3d_fwd.cpp
profile_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
profile_avg_pool2d_bwd.cpp
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_pool3d_fwd_impl.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
struct maxPoolFwdArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
{"wsize", {}},
{"wstride", {}},
{"wdilation", {}},
{"pad1", {}},
{"pad2", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
void print_help_max_pool3d_fwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "arg6: return index (0=no, 1=yes)\n"
<< "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
<< "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
<< "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
<< "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
<< "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
<< "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
<< "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<< std::endl;
}
int profile_max_pool3d_fwd(int argc, char* argv[])
{
ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
bool do_verification = true;
int init_method = 0;
bool do_log = false;
bool time_kernel = true;
bool return_index = false;
std::vector<index_t> in_length = {2, 32, 30, 30, 30};
std::vector<index_t> wsize = {2, 2, 2};
std::vector<index_t> wstride = {2, 2, 2};
std::vector<index_t> wdilation = {1, 1, 1};
std::vector<index_t> pad1 = {1, 1, 1};
std::vector<index_t> pad2 = {1, 1, 1};
if(argc != 2 && argc != 34)
{
print_help_max_pool3d_fwd();
return 0;
}
else if(argc == 34)
{
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
do_verification = std::stoi(argv[3]);
init_method = std::stoi(argv[4]);
do_log = std::stoi(argv[5]);
time_kernel = std::stoi(argv[6]);
return_index = std::stoi(argv[7]);
// parse the long options
maxPoolFwdArgParser arg_parser;
arg_parser(argc, argv);
in_length = arg_parser.long_opts["length"];
wsize = arg_parser.long_opts["wsize"];
wstride = arg_parser.long_opts["wstride"];
wdilation = arg_parser.long_opts["wdilation"];
pad1 = arg_parser.long_opts["pad1"];
pad2 = arg_parser.long_opts["pad2"];
}
#ifdef CK_ENABLE_FP16
using F16 = ck::half_t;
#endif
#ifdef CK_ENABLE_BF16
using BF16 = ck::bhalf_t;
#endif
#ifdef CK_ENABLE_FP32
using F32 = float;
#endif
using I32 = int32_t;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
#if 1
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
if(false)
;
#ifdef CK_ENABLE_FP16
else if(data_type == ck::DataTypeEnum::Half)
{
if(return_index)
ck::profiler::
profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
else
ck::profiler::
profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_BF16
else if(data_type == ck::DataTypeEnum::BFloat16)
{
if(return_index)
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ReduceOpId,
false,
true>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
else
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ReduceOpId,
false,
false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_FP32
else if(data_type == ck::DataTypeEnum::Float)
{
if(return_index)
ck::profiler::
profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
else
ck::profiler::
profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
else
{
throw std::runtime_error("not implemented yet");
}
return 0;
}
REGISTER_PROFILER_OPERATION("max_pool3d_fwd", "max_pool3d fwd", profile_max_pool3d_fwd);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_pool3d_fwd_impl.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
struct poolFwdArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
{"wsize", {}},
{"wstride", {}},
{"wdilation", {}},
{"pad1", {}},
{"pad2", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
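// Collect every integer that follows "--<key>" on the command line, stopping
// at the next token that starts with '-' or at the end of argv.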
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
void print_help_pool3d_fwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32; 3: int8; 5: bf16; 7: fp8)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "arg6: return index (0=no, 1=yes)\n"
<< "arg7: reduce op (0: max; 1: avg)\n"
<< "--length: input tensor length for NCDHW (e.g., --length 2 32 30 30 30) \n"
<< "--wsize: window size for ZYX (e.g., --wsize 2 2 2) \n"
<< "--wstride: window stride for DHW (e.g., --wstride 2 2 2) \n"
<< "--wdilation: window dilation for DHW (e.g., --wdilation 1 1 1) \n"
<< "--pad1: left side of padding in DHW (e.g., --pad1 1 1 1) \n"
<< "--pad2: right side of padding in DHW (e.g., --pad2 1 1 1) \n"
<< "e.g.: ckProfiler pool3d_fwd 0 1 2 0 1 0 0 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<< std::endl;
}
int profile_pool3d_fwd(int argc, char* argv[])
{
ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
ck::profiler::PoolFwdInputParams in_params{true, 0, false, true, false, 0};
ck::profiler::PoolFwdKernelParams kernel_params{
{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
if(argc != 2 && argc != 35)
{
print_help_pool3d_fwd();
return 0;
}
else if(argc == 35)
{
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
in_params.do_verification = std::stoi(argv[3]);
in_params.init_method = std::stoi(argv[4]);
in_params.do_log = std::stoi(argv[5]);
in_params.time_kernel = std::stoi(argv[6]);
in_params.return_index = std::stoi(argv[7]);
in_params.reduce_op = std::stoi(argv[8]);
// parse the long options
poolFwdArgParser arg_parser;
arg_parser(argc, argv);
kernel_params.in_length = arg_parser.long_opts["length"];
kernel_params.window_spatial_lengths = arg_parser.long_opts["wsize"];
kernel_params.window_strides = arg_parser.long_opts["wstride"];
kernel_params.window_dilations = arg_parser.long_opts["wdilation"];
kernel_params.input_left_pads = arg_parser.long_opts["pad1"];
kernel_params.input_right_pads = arg_parser.long_opts["pad2"];
}
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
using F8 = ck::f8_t;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
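// ComputeDataType below follows the instance definitions in this change: AVG
// accumulates in F32 for the floating-point formats and in I32 for int8, while
// MAX compares in the input type itself.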
if(data_type == ck::DataTypeEnum::Half)
{
if(in_params.reduce_op == 1)
{
ck::profiler::profile_pool3d_fwd_impl<F16,
F16,
F32,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::AVG,
false,
false>(in_params, kernel_params);
}
else
{ // reduce_op == 0
if(in_params.return_index)
{
ck::profiler::profile_pool3d_fwd_impl<F16,
F16,
F16,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
true>(in_params, kernel_params);
}
else
{
ck::profiler::profile_pool3d_fwd_impl<F16,
F16,
F16,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
false>(in_params, kernel_params);
}
}
}
else if(data_type == ck::DataTypeEnum::BFloat16)
{
if(in_params.reduce_op == 1)
{
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
F32,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::AVG,
false,
false>(in_params, kernel_params);
}
else
{ // reduce_op == 0
if(in_params.return_index)
{
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
true>(in_params, kernel_params);
}
else
{
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
false>(in_params, kernel_params);
}
}
}
else if(data_type == ck::DataTypeEnum::Float)
{
if(in_params.reduce_op == 1)
{
ck::profiler::profile_pool3d_fwd_impl<F32,
F32,
F32,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::AVG,
false,
false>(in_params, kernel_params);
}
else
{ // reduce_op == 0
if(in_params.return_index)
{
ck::profiler::profile_pool3d_fwd_impl<F32,
F32,
F32,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
true>(in_params, kernel_params);
}
else
{
ck::profiler::profile_pool3d_fwd_impl<F32,
F32,
F32,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
false>(in_params, kernel_params);
}
}
}
else if(data_type == ck::DataTypeEnum::Float8)
{
if(in_params.reduce_op == 1)
{
return ck::profiler::profile_pool3d_fwd_impl<F8,
F8,
F32,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::AVG,
false,
false>(in_params, kernel_params);
}
else
{ // reduce_op == 0
if(in_params.return_index)
{
return ck::profiler::profile_pool3d_fwd_impl<F8,
F8,
F8,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
true>(in_params, kernel_params);
}
else
{
return ck::profiler::profile_pool3d_fwd_impl<F8,
F8,
F8,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
false>(in_params, kernel_params);
}
}
}
else if(data_type == ck::DataTypeEnum::Int8)
{
if(in_params.reduce_op == 1)
{
return ck::profiler::profile_pool3d_fwd_impl<I8,
I8,
I32,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::AVG,
false,
false>(in_params, kernel_params);
}
else
{ // reduce_op == 0
if(in_params.return_index)
{
return ck::profiler::profile_pool3d_fwd_impl<I8,
I8,
I8,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
true>(in_params, kernel_params);
}
else
{
return ck::profiler::profile_pool3d_fwd_impl<I8,
I8,
I8,
I32,
NDHWC,
NDHWC,
ck::ReduceTensorOp::MAX,
false,
false>(in_params, kernel_params);
}
}
}
else
{
throw std::runtime_error("not implemented yet");
}
return 0;
}
REGISTER_PROFILER_OPERATION("pool3d_fwd", "pool3d fwd", profile_pool3d_fwd);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_pool3d_fwd_impl.hpp"
......@@ -16,10 +16,19 @@ class TestAvgPool3dFwd : public ::testing::Test
std::vector<PoolingParam> params;
ck::profiler::PoolFwdInputParams in_params_avg_pool{true, 2, false, false, false, 1};
void Run()
{
for(auto param : params)
{
ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_};
bool success =
ck::profiler::profile_pool3d_fwd_impl<InDataType,
OutDataType,
......@@ -29,26 +38,18 @@ class TestAvgPool3dFwd : public ::testing::Test
ck::tensor_layout::convolution::NDHWC,
ck::ReduceTensorOp::AVG,
false,
false>(true,
2,
false,
false,
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
false>(in_params_avg_pool, kernel_params);
EXPECT_TRUE(success);
}
}
};
#ifdef CK_ENABLE_FP16
using KernelTypes =
::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
#else
using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
#endif
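// Each tuple is <InDataType, OutDataType, ComputeDataType, IndexDataType>,
// matching the leading template parameters of profile_pool3d_fwd_impl; the AVG
// cases accumulate in F32 (I32 for int8), in line with the new instances.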
using KernelTypes = ::testing::Types<std::tuple<I8, I8, I32, I32>,
std::tuple<F8, F8, F32, I32>,
std::tuple<F16, F16, F32, I32>,
std::tuple<BF16, BF16, F32, I32>,
std::tuple<F32, F32, F32, I32>>;
TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes);
TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_pool3d_fwd_impl.hpp"
......@@ -16,10 +16,20 @@ class TestMaxPool3dFwd : public ::testing::Test
std::vector<PoolingParam> params;
ck::profiler::PoolFwdInputParams in_params_max_pool{true, 2, false, false, false, 0};
ck::profiler::PoolFwdInputParams in_params_max_pool_indexed{true, 2, false, false, true, 0};
void Run()
{
for(auto param : params)
{
ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_};
// max pool
bool success =
ck::profiler::profile_pool3d_fwd_impl<InDataType,
......@@ -30,16 +40,7 @@ class TestMaxPool3dFwd : public ::testing::Test
ck::tensor_layout::convolution::NDHWC,
ck::ReduceTensorOp::MAX,
false,
false>(true,
2,
false,
false,
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
false>(in_params_max_pool, kernel_params);
EXPECT_TRUE(success);
// max pool + index
......@@ -51,27 +52,18 @@ class TestMaxPool3dFwd : public ::testing::Test
ck::tensor_layout::convolution::NDHWC,
ck::ReduceTensorOp::MAX,
false,
true>(true,
2,
false,
false,
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
true>(in_params_max_pool_indexed,
kernel_params);
EXPECT_TRUE(success);
}
}
};
#ifdef CK_ENABLE_FP16
using KernelTypes =
::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
#else
using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
#endif
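// Each tuple is <InDataType, OutDataType, ComputeDataType, IndexDataType>;
// the MAX cases compare in the native input type, matching the adjusted
// compute types of the MAX instances.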
using KernelTypes = ::testing::Types<std::tuple<I8, I8, I8, I32>,
std::tuple<F8, F8, F8, I32>,
std::tuple<F16, F16, F16, I32>,
std::tuple<BF16, BF16, BF16, I32>,
std::tuple<F32, F32, F32, I32>>;
TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
TYPED_TEST(TestMaxPool3dFwd, Test_Pool)
......
......@@ -4,6 +4,8 @@
#include "gtest/gtest.h"
#include "ck/ck.hpp"
using I8 = int8_t;
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
......