Commit 95a83c6e authored by Adam Osewski
Browse files

Merge remote-tracking branch 'origin/develop' into wavelet_model

parents 5b7c2432 892a8d76
...@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification, ...@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
{ {
wei_device_buf.FromDevice(weights_device_result.mData.data()); wei_device_buf.FromDevice(weights_device_result.mData.data());
success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData); success = ck::utils::check_err(weights_host_result, weights_device_result);
if(success == false) if(success == false)
{ {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
namespace profiler {
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise2D(HostTensorC& C,
const HostTensorA& A,
const HostTensorB& B,
const std::vector<std::size_t>& shape,
Functor functor)
{
using ctype = ck::remove_reference_t<decltype(C(0, 0))>;
for(std::size_t m = 0; m < shape[0]; ++m)
for(std::size_t n = 0; n < shape[1]; ++n)
{
auto a_val = A(m, n);
auto b_val = B(m, n);
ctype c_val = 0;
functor(c_val, a_val, b_val);
C(m, n) = c_val;
}
}
template <typename ADataType,
typename BDataType,
typename GammaDataType,
typename BetaDataType,
typename AccDataType,
typename YDataType>
bool profile_elementwise_layernorm_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
{
using Add = ck::tensor_operation::element_wise::Add;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
if(length.size() != 2)
return false;
index_t M = length[0];
index_t N = length[1];
index_t Stride = N;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
std::vector<index_t> reduce_dim = {1};
std::vector<index_t> gammaBetaLength = {N};
std::vector<index_t> gammaBetaStride = {0, 1};
auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
using namespace ck::literals;
return HostTensorDescriptor({row, col}, {stride, 1_uz});
};
Tensor<ADataType> a(length);
Tensor<BDataType> b(length);
Tensor<GammaDataType> gamma(gammaBetaLength);
Tensor<BetaDataType> beta(gammaBetaLength);
Tensor<YDataType> y(length);
Tensor<YDataType> host_y(length);
switch(init_method)
{
case 0:
a.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
b.GenerateTensorValue(GeneratorTensor_1<BDataType>{});
gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
break;
case 1:
a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
break;
default:
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1});
b.GenerateTensorValue(GeneratorTensor_3<BDataType>{0, 1});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
}
DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_dev(sizeof(ADataType) * b.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
a_dev.ToDevice(a.mData.data());
b_dev.ToDevice(b.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
// add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
ck::Tuple<ADataType, BDataType>,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
Add,
PassThrough,
2,
1>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
using XDataType = ADataType;
std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
static_cast<unsigned long>(N)};
Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
host_elementwise2D<Tensor<ADataType>, Tensor<BDataType>, Tensor<XDataType>, Add>(
x, a, b, mn, Add{});
using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
PassThrough,
Rank,
NumReduceDim>;
ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
length,
{
std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
},
gammaBetaStride,
gammaBetaStride,
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
reduce_dim,
1e-4,
input,
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
Add{},
PassThrough{});
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
continue;
}
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) +
b.mDesc.GetElementSize() * sizeof(BDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
y_dev.FromDevice(y.mData.data());
bool pass =
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b : ", b.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << ", ";
std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
<< best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is tested" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
{ {
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
// run reference // run reference
if(do_verification) if(do_verification)
{ {
Tensor<AccDataType> c_m_n(HostTensorDescriptor( Tensor<AccDataType> c_m_n({M, N});
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
...@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
{ {
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
} }
} }
else else
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
int StrideD0) int StrideD0)
{ {
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}), return HostTensorDescriptor({len}, {stride});
std::vector<std::size_t>({stride}));
}; };
auto f_host_tensor_descriptor2d = auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1)); Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_host_result( Tensor<ReduceDataType> reduce0_m_host_result({M});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_host_result({M});
Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_device_result( Tensor<ReduceDataType> reduce0_m_device_result({M});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_device_result({M});
Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
...@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData); ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData); ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log) if(do_log)
{ {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification,
{ {
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
// run reference // run reference
if(do_verification) if(do_verification)
{ {
Tensor<AccDataType> c_m_n(HostTensorDescriptor( Tensor<AccDataType> c_m_n({M, N});
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
...@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
{ {
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
} }
} }
else else
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification, ...@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification,
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification, ...@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification,
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass = pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification,
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification,
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_host_result( Tensor<ReduceDataType> reduce0_m_host_result({M});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_host_result({M});
Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_device_result( Tensor<ReduceDataType> reduce0_m_device_result({M});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_device_result({M});
Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
...@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification,
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData); ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData); ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log) if(do_log)
{ {
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification,
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification,
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass = pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -3,9 +3,10 @@ ...@@ -3,9 +3,10 @@
#pragma once #pragma once
#include "ck/ck.hpp" #include <algorithm>
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
#include <iterator>
#include <typeinfo> #include <typeinfo>
#include "ck/ck.hpp" #include "ck/ck.hpp"
...@@ -13,7 +14,7 @@ ...@@ -13,7 +14,7 @@
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -26,32 +27,6 @@ ...@@ -26,32 +27,6 @@
namespace ck { namespace ck {
namespace profiler { namespace profiler {
/// Prints every element of a 4D tensor to std::cout as nested "[...]" groups.
///
/// Nesting, from outermost to innermost, follows entries 0, 2, 3 and 1 of
/// `nhwc.mDesc.GetLengths()`. Each element is read as nhwc(i0, i1, i2, i3), cast to float
/// and followed by a single space. No newline is emitted.
template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    // Extent of one dimension as int; looked up on each use, as the loop bounds require.
    const auto extent = [&nhwc](int dim) {
        return ck::type_convert<int>(nhwc.mDesc.GetLengths()[dim]);
    };

    std::cout << "[";
    for(int batch = 0; batch < extent(0); ++batch)
    {
        std::cout << "[";
        for(int row = 0; row < extent(2); ++row)
        {
            std::cout << "[";
            for(int col = 0; col < extent(3); ++col)
            {
                std::cout << "[";
                for(int channel = 0; channel < extent(1); ++channel)
                {
                    std::cout << static_cast<float>(nhwc(batch, channel, row, col)) << " ";
                }
                std::cout << "]";
            }
            std::cout << "]";
        }
        std::cout << "]";
    }
    std::cout << "]";
}
template <ck::index_t NDimSpatial, template <ck::index_t NDimSpatial,
typename InLayout, typename InLayout,
typename WeiLayout, typename WeiLayout,
...@@ -59,12 +34,12 @@ template <ck::index_t NDimSpatial, ...@@ -59,12 +34,12 @@ template <ck::index_t NDimSpatial,
typename InDataType, typename InDataType,
typename WeiDataType, typename WeiDataType,
typename OutDataType> typename OutDataType>
bool profile_conv_bwd_weight_impl(int do_verification, bool profile_grouped_conv_bwd_weight_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
bool time_kernel, bool time_kernel,
const ck::utils::conv::ConvParam& conv_param, const ck::utils::conv::ConvParam& conv_param,
ck::index_t split_k) ck::index_t split_k)
{ {
using InElementOp = ck::tensor_operation::element_wise::PassThrough; using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
...@@ -114,16 +89,14 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -114,16 +89,14 @@ bool profile_conv_bwd_weight_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial, auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp>{}; OutElementOp>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input, auto ref_argument = ref_conv.MakeArgument(input,
weight_host_result, weight_host_result,
output, output,
...@@ -138,16 +111,16 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -138,16 +111,16 @@ bool profile_conv_bwd_weight_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
using DeviceOp = ck::tensor_operation::device::DeviceConvBwdWeight<NDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp>; OutElementOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -163,22 +136,41 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -163,22 +136,41 @@ bool profile_conv_bwd_weight_impl(int do_verification,
// profile device Conv instances // profile device Conv instances
bool all_pass = true; bool all_pass = true;
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
range_copy(conv_param.input_left_pads_, begin(input_left_pads));
range_copy(conv_param.input_right_pads_, begin(input_right_pads));
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()), static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param.G_,
conv_param.N_, conv_param.N_,
conv_param.K_, conv_param.K_,
conv_param.C_, conv_param.C_,
conv_param.input_spatial_lengths_, input_spatial_lengths,
conv_param.filter_spatial_lengths_, filter_spatial_lengths,
conv_param.output_spatial_lengths_, output_spatial_lengths,
conv_param.conv_filter_strides_, conv_filter_strides,
conv_param.conv_filter_dilations_, conv_filter_dilations,
conv_param.input_left_pads_, input_left_pads,
conv_param.input_right_pads_, input_right_pads,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op, out_element_op,
...@@ -217,33 +209,29 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -217,33 +209,29 @@ bool profile_conv_bwd_weight_impl(int do_verification,
{ {
wei_device_buf.FromDevice(weight_device_result.mData.data()); wei_device_buf.FromDevice(weight_device_result.mData.data());
bool pass = bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
if(!pass) if(!pass)
{ {
std::cout << "Fail info:" << op_ptr->GetTypeString() << std::endl; std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
} }
all_pass &= pass; all_pass &= pass;
if(do_log) if(do_log)
{ {
std::cout << "in : "; LogRangeAsType<float>(std::cout << "output : ", output.mData, ",") << std::endl;
show_data_nhwc_layout(output); ;
std::cout << std::endl; LogRangeAsType<float>(
std::cout << "weight (device): ", weight_device_result.mData, ",")
std::cout << "wei: "; << std::endl;
show_data_nhwc_layout(weight_host_result); ;
std::cout << std::endl; LogRangeAsType<float>(
std::cout << "weight (host): ", weight_host_result.mData, ",")
std::cout << "out : "; << std::endl;
show_data_nhwc_layout(input); ;
std::cout << std::endl; LogRangeAsType<float>(std::cout << "input: ", input.mData, ",") << std::endl;
;
std::cout << "wei_device: ";
show_data_nhwc_layout(weight_device_result);
std::cout << std::endl;
} }
} }
} }
......
...@@ -9,11 +9,12 @@ ...@@ -9,11 +9,12 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
...@@ -66,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -66,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{}; std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
...@@ -136,25 +137,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -136,25 +137,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name; std::string best_op_name;
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
...@@ -163,29 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -163,29 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
// profile device op instances // profile device op instances
bool pass = true; bool pass = true;
for(auto& op_ptr : op_ptrs) auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
// re-init output to zero before profiling next kernel // re-init output to zero before profiling next kernel
...@@ -220,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -220,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
{ {
out_device_buf.FromDevice(device_output.mData.data()); out_device_buf.FromDevice(device_output.mData.data());
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData); pass = pass & ck::utils::check_err(device_output, host_output);
if(do_log) if(do_log)
{ {
...@@ -237,6 +197,95 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -237,6 +197,95 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
{ {
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
} }
};
// xdl
{
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
{},
{},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
run_impl(op_ptr, argument_ptr);
}
}
// dl
{
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwd<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
run_impl(op_ptr, argument_ptr);
}
} }
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification,
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification,
c_element_op); c_element_op);
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData, pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -126,6 +126,8 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -126,6 +126,8 @@ bool profile_groupnorm_impl(int do_verification,
gamma_dev.GetDeviceBuffer(), gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(), beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(), y_dev.GetDeviceBuffer(),
nullptr,
nullptr,
PassThrough{}); PassThrough{});
if(inst_ptr->IsSupportedArgument(argument_ptr.get())) if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
...@@ -163,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -163,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification,
{ {
y_dev.FromDevice(y.mData.data()); y_dev.FromDevice(y.mData.data());
bool pass = bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log) if(do_log)
{ {
...@@ -196,7 +197,7 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -196,7 +197,7 @@ bool profile_groupnorm_impl(int do_verification,
if(num_kernel == 0) if(num_kernel == 0)
{ {
std::cout << "Error: No kernel is tested" << std::endl; std::cout << "Error: No kernel is applicable" << std::endl;
return false; return false;
} }
......
...@@ -22,7 +22,7 @@ template <typename XDataType, ...@@ -22,7 +22,7 @@ template <typename XDataType,
typename AccDataType, typename AccDataType,
typename YDataType, typename YDataType,
index_t Rank> index_t Rank>
void profile_layernorm_impl(int do_verification, bool profile_layernorm_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
bool time_kernel, bool time_kernel,
...@@ -31,7 +31,7 @@ void profile_layernorm_impl(int do_verification, ...@@ -31,7 +31,7 @@ void profile_layernorm_impl(int do_verification,
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
if(length.size() < 2) if(length.size() < 2)
return; return false;
// Assume normalize dimension except for batch (first) dimension // Assume normalize dimension except for batch (first) dimension
std::vector<index_t> reduce_length{length.begin() + 1, length.end()}; std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
...@@ -52,7 +52,6 @@ void profile_layernorm_impl(int do_verification, ...@@ -52,7 +52,6 @@ void profile_layernorm_impl(int do_verification,
switch(init_method) switch(init_method)
{ {
// case 0: break;
case 0: case 0:
x.GenerateTensorValue(GeneratorTensor_1<XDataType>{}); x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{}); gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
...@@ -122,6 +121,8 @@ void profile_layernorm_impl(int do_verification, ...@@ -122,6 +121,8 @@ void profile_layernorm_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer(length, auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
...@@ -135,12 +136,21 @@ void profile_layernorm_impl(int do_verification, ...@@ -135,12 +136,21 @@ void profile_layernorm_impl(int do_verification,
gamma_dev.GetDeviceBuffer(), gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(), beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(), y_dev.GetDeviceBuffer(),
nullptr,
nullptr,
PassThrough{}); PassThrough{});
if(!inst_ptr->IsSupportedArgument(argument_ptr.get())) if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{ {
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; if(time_kernel)
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl; {
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
}
continue; continue;
} }
...@@ -156,8 +166,9 @@ void profile_layernorm_impl(int do_verification, ...@@ -156,8 +166,9 @@ void profile_layernorm_impl(int do_verification,
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " if(time_kernel)
<< inst_ptr->GetTypeString() << std::endl; std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time) if(avg_time < best_avg_time)
{ {
...@@ -184,20 +195,32 @@ void profile_layernorm_impl(int do_verification, ...@@ -184,20 +195,32 @@ void profile_layernorm_impl(int do_verification,
{ {
std::cout << inst_ptr->GetTypeString() << " failed verification: "; std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl; LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
return; return false;
} }
else else
{ {
std::cout << "pass" << std::endl; if(time_kernel)
std::cout << "pass" << std::endl;
} }
} }
} }
LogRange(std::cout << "length = ", length, ",") << ", "; if(time_kernel)
LogRange(std::cout << "stride = ", strideXY, ",") << ", "; {
LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl; LogRange(std::cout << "length = ", length, ",") << ", ";
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
<< best_instance_name << std::endl; LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
} }
} // namespace profiler } // namespace profiler
......
...@@ -6,8 +6,9 @@ ...@@ -6,8 +6,9 @@
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_reduction.hpp" #include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/host_common_util.hpp" #include "ck/library/utility/host_common_util.hpp"
...@@ -18,57 +19,61 @@ namespace tensor_operation { ...@@ -18,57 +19,61 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex> template <index_t Rank,
index_t NumReduceDim,
ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool UseIndex>
struct ReduceDescription struct ReduceDescription
{ {
static constexpr int Rank_ = Rank; static constexpr index_t Rank_ = Rank;
static constexpr int NumReduceDim_ = NumReduceDim; static constexpr index_t NumReduceDim_ = NumReduceDim;
static constexpr int ReduceOpId_ = ReduceOpId; static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId;
static constexpr int PropagateNan_ = PropagateNan; static constexpr bool PropagateNan_ = PropagateNan;
static constexpr int UseIndex_ = UseIndex; static constexpr bool UseIndex_ = UseIndex;
}; };
using reduce_description_instances = using reduce_description_instances =
std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD std::tuple<ReduceDescription<4, 3, ReduceTensorOp::ADD, false, false>, // for ADD
ReduceDescription<4, 4, 0, false, false>, ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>,
ReduceDescription<4, 1, 0, false, false>, ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>,
ReduceDescription<2, 1, 0, false, false>, ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>,
ReduceDescription<4, 3, 5, false, false>, // for AVG ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG
ReduceDescription<4, 4, 5, false, false>, ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>,
ReduceDescription<4, 1, 5, false, false>, ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>,
ReduceDescription<2, 1, 5, false, false>, ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>,
ReduceDescription<4, 3, 7, false, false>, // for NORM2 ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2
ReduceDescription<4, 4, 7, false, false>, ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>,
ReduceDescription<4, 1, 7, false, false>, ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>,
ReduceDescription<2, 1, 7, false, false>, ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>,
ReduceDescription<4, 3, 2, false, false>, // for MIN ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN
ReduceDescription<4, 4, 2, false, false>, ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>,
ReduceDescription<4, 1, 2, false, false>, ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>,
ReduceDescription<2, 1, 2, false, false>, ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>,
ReduceDescription<4, 3, 3, false, false>, // for MAX ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX
ReduceDescription<4, 4, 3, false, false>, ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>,
ReduceDescription<4, 1, 3, false, false>, ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>,
ReduceDescription<2, 1, 3, false, false>, ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>,
ReduceDescription<4, 3, 4, false, false>, // for AMAX ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX
ReduceDescription<4, 4, 4, false, false>, ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>,
ReduceDescription<4, 1, 4, false, false>, ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>,
ReduceDescription<2, 1, 4, false, false>, ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>,
ReduceDescription<4, 3, 2, false, true>, // for MIN ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN
ReduceDescription<4, 4, 2, false, true>, ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>,
ReduceDescription<4, 1, 2, false, true>, ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>,
ReduceDescription<2, 1, 2, false, true>, ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>,
ReduceDescription<4, 3, 3, false, true>, // for MAX ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX
ReduceDescription<4, 4, 3, false, true>, ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>,
ReduceDescription<4, 1, 3, false, true>, ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>,
ReduceDescription<2, 1, 3, false, true>, ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>,
ReduceDescription<4, 3, 4, false, true>, // for AMAX ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX
ReduceDescription<4, 4, 4, false, true>, ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>,
ReduceDescription<4, 1, 4, false, true>, ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>,
ReduceDescription<2, 1, 4, false, true>>; ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>;
template <typename DescriptionType> template <typename DescriptionType>
bool description_match(const DescriptionType& description, bool description_match(const DescriptionType& description,
...@@ -78,9 +83,8 @@ bool description_match(const DescriptionType& description, ...@@ -78,9 +83,8 @@ bool description_match(const DescriptionType& description,
bool PropagateNan, bool PropagateNan,
bool UseIndex) bool UseIndex)
{ {
if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) || if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId ||
description.PropagateNan_ != static_cast<int>(PropagateNan) || description.PropagateNan_ != PropagateNan || description.UseIndex_ != UseIndex)
description.UseIndex_ != static_cast<int>(UseIndex))
return (false); return (false);
if(DescriptionType::NumReduceDim_ != reduceDims.size()) if(DescriptionType::NumReduceDim_ != reduceDims.size())
...@@ -99,11 +103,10 @@ bool description_match(const DescriptionType& description, ...@@ -99,11 +103,10 @@ bool description_match(const DescriptionType& description,
namespace ck { namespace ck {
namespace profiler { namespace profiler {
template <index_t Rank, index_t NumReduceDim> template <int Rank, int NumReduceDim>
static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims) static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{ {
assert(NumReduceDim == reduceDims.size());
int reduceFlag = 0; int reduceFlag = 0;
// flag the bits for the reduceDims // flag the bits for the reduceDims
...@@ -112,13 +115,15 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce ...@@ -112,13 +115,15 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
reduceFlag |= 1 << reduceDims[i]; reduceFlag |= 1 << reduceDims[i];
}; };
std::vector<int> invariantDims; std::array<int, Rank - NumReduceDim> invariantDims;
// collect invariant dimensions // collect invariant dimensions
int dim = 0;
for(int i = 0; i < Rank; i++) for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0) if((reduceFlag & (1 << i)) == 0)
{ {
invariantDims.push_back(i); invariantDims[dim] = i;
dim++;
}; };
return invariantDims; return invariantDims;
...@@ -137,7 +142,7 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -137,7 +142,7 @@ bool profile_reduce_impl_impl(bool do_verification,
bool do_dumpout, bool do_dumpout,
bool time_kernel, bool time_kernel,
const std::vector<size_t>& inLengths, const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims, const std::array<int, NumReduceDim>& reduceDims,
float alpha, float alpha,
float beta) float beta)
{ {
...@@ -145,6 +150,8 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -145,6 +150,8 @@ bool profile_reduce_impl_impl(bool do_verification,
using namespace ck::tensor_operation::device::instance; using namespace ck::tensor_operation::device::instance;
using ck::host_common::dumpBufferToFile; using ck::host_common::dumpBufferToFile;
constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
constexpr bool op_support_indices = constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
...@@ -279,28 +286,32 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -279,28 +286,32 @@ bool profile_reduce_impl_impl(bool do_verification,
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator( reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length)); static_cast<int32_t>(reduce_total_length));
using DeviceReduceInstPtr0 = using DeviceReduceInstPtr =
DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>; DeviceReducePtr<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>;
std::vector<DeviceReduceInstPtr0> reduce0_ptrs; std::vector<DeviceReduceInstPtr> reduce_ptrs;
add_device_reduce_instance_threadwise<InDataType, add_device_reduce_instance_threadwise<InDataType,
AccDataType, AccDataType,
OutDataType, OutDataType,
Rank, Rank,
NumReduceDim, NumReduceDim,
ReduceOpId, ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan, PropagateNan,
UseIndex>(reduce0_ptrs); UseIndex>(reduce_ptrs);
add_device_reduce_instance_blockwise<InDataType, add_device_reduce_instance_blockwise<InDataType,
AccDataType, AccDataType,
OutDataType, OutDataType,
Rank, Rank,
NumReduceDim, NumReduceDim,
ReduceOpId, ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan, PropagateNan,
UseIndex>(reduce0_ptrs); UseIndex>(reduce_ptrs);
if constexpr(use_atomic_add) if constexpr(use_atomic_add)
{ {
...@@ -309,12 +320,14 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -309,12 +320,14 @@ bool profile_reduce_impl_impl(bool do_verification,
OutDataType, OutDataType,
Rank, Rank,
NumReduceDim, NumReduceDim,
ReduceOpId, ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan, PropagateNan,
UseIndex>(reduce0_ptrs); UseIndex>(reduce_ptrs);
} }
if(reduce0_ptrs.empty()) if(reduce_ptrs.empty())
{ {
throw std::runtime_error("Wrong! No device REDUCE instance found"); throw std::runtime_error("Wrong! No device REDUCE instance found");
}; };
...@@ -342,22 +355,22 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -342,22 +355,22 @@ bool profile_reduce_impl_impl(bool do_verification,
acc_elementwise_op); acc_elementwise_op);
}; };
std::vector<ck::index_t> i_inLengths; std::array<index_t, Rank> arrInLengths;
std::vector<ck::index_t> i_inStrides; std::array<index_t, Rank> arrInStrides;
std::vector<ck::index_t> i_outLengths; std::array<index_t, NumOutDim> arrOutLengths;
std::vector<ck::index_t> i_outStrides; std::array<index_t, NumOutDim> arrOutStrides;
i_inLengths.assign(inLengths.begin(), inLengths.end()); ck::ranges::copy(inLengths, arrInLengths.begin());
i_inStrides.assign(inStrides.begin(), inStrides.end()); ck::ranges::copy(inStrides, arrInStrides.begin());
i_outLengths.assign(outLengths.begin(), outLengths.end()); ck::ranges::copy(outLengths, arrOutLengths.begin());
i_outStrides.assign(outStrides.begin(), outStrides.end()); ck::ranges::copy(outStrides, arrOutStrides.begin());
for(auto& reduce_ptr : reduce0_ptrs) for(auto& reduce_ptr : reduce_ptrs)
{ {
auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths,
i_inStrides, arrInStrides,
i_outLengths, arrOutLengths,
i_outStrides, arrOutStrides,
reduceDims, reduceDims,
alpha, alpha,
beta, beta,
...@@ -399,13 +412,12 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -399,13 +412,12 @@ bool profile_reduce_impl_impl(bool do_verification,
bool single_pass; bool single_pass;
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.mData.data());
single_pass = ck::utils::check_err(out.mData, out_ref.mData); single_pass = ck::utils::check_err(out, out_ref);
if(OutputIndex) if(OutputIndex)
{ {
out_indices_dev.FromDevice(out_indices.mData.data()); out_indices_dev.FromDevice(out_indices.mData.data());
single_pass = single_pass && single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
}; };
if(!single_pass) if(!single_pass)
...@@ -478,22 +490,25 @@ bool profile_reduce_impl(bool do_verification, ...@@ -478,22 +490,25 @@ bool profile_reduce_impl(bool do_verification,
descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex)) descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
return; return;
pass = pass && std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;
profile_reduce_impl_impl<InDataType,
AccDataType, ck::ranges::copy(reduceDims, arrReduceDims.begin());
OutDataType,
descType::Rank_, pass = pass && profile_reduce_impl_impl<InDataType,
descType::NumReduceDim_, AccDataType,
static_cast<ReduceTensorOp>(descType::ReduceOpId_), OutDataType,
static_cast<bool>(descType::PropagateNan_), descType::Rank_,
static_cast<bool>(descType::UseIndex_)>(do_verification, descType::NumReduceDim_,
init_method, static_cast<ReduceTensorOp>(descType::ReduceOpId_),
do_dumpout, descType::PropagateNan_,
time_kernel, descType::UseIndex_>(do_verification,
inLengths, init_method,
reduceDims, do_dumpout,
alpha, time_kernel,
beta); inLengths,
arrReduceDims,
alpha,
beta);
matched = true; matched = true;
}); });
......
...@@ -3,55 +3,27 @@ ...@@ -3,55 +3,27 @@
#pragma once #pragma once
#include <algorithm>
#include <iomanip> #include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp" #include "ck/utility/data_type.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
namespace {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace
void add_device_softmax_f16_f16_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f16_f16_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
void add_device_softmax_f32_f32_rank3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f32_f32_rank4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck { namespace ck {
namespace profiler { namespace profiler {
enum struct NormType enum struct SoftmaxDataType
{
BATCHNORM,
SOFTMAX,
};
enum struct NormDataType
{ {
F32_F32, // in, out F32_F32, // in, out
F16_F16, F16_F16,
...@@ -60,7 +32,7 @@ enum struct NormDataType ...@@ -60,7 +32,7 @@ enum struct NormDataType
}; };
// clang-format off // clang-format off
template <typename NormDataType> std::string type_to_string(); template <typename SoftmaxDataType> std::string type_to_string();
template <> std::string type_to_string<float>() { return "f32"; } template <> std::string type_to_string<float>() { return "f32"; }
template <> std::string type_to_string<half_t>() { return "f16"; } template <> std::string type_to_string<half_t>() { return "f16"; }
template <> std::string type_to_string<bhalf_t>() { return "bf16"; } template <> std::string type_to_string<bhalf_t>() { return "bf16"; }
...@@ -69,7 +41,7 @@ template <> std::string type_to_string<int32_t>() { return "int32"; } ...@@ -69,7 +41,7 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
// clang-format on // clang-format on
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank> template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
void profile_softmax_impl(int do_verification, bool profile_softmax_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
bool time_kernel, bool time_kernel,
...@@ -77,8 +49,7 @@ void profile_softmax_impl(int do_verification, ...@@ -77,8 +49,7 @@ void profile_softmax_impl(int do_verification,
std::vector<index_t> in_strides, std::vector<index_t> in_strides,
std::vector<index_t> reduce_dims, std::vector<index_t> reduce_dims,
AccDataType alpha, AccDataType alpha,
AccDataType beta, AccDataType beta)
NormType norm_type)
{ {
if(Rank != in_length.size()) if(Rank != in_length.size())
{ {
...@@ -88,62 +59,46 @@ void profile_softmax_impl(int do_verification, ...@@ -88,62 +59,46 @@ void profile_softmax_impl(int do_verification,
Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length) Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
: Tensor<InDataType>(in_length, in_strides); : Tensor<InDataType>(in_length, in_strides);
Tensor<OutDataType> out(in.mDesc); Tensor<OutDataType> out(in.mDesc);
Tensor<OutDataType> prior_out(in.mDesc);
switch(init_method) switch(init_method)
{ {
// case 0: break; case 0: break;
case 0:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
break;
case 1: case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<InDataType>{-5.f, 5.f}(in.begin(), in.end());
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<OutDataType>{-5.f, 5.f}(prior_out.begin(),
prior_out.end());
break; break;
default: default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); ck::utils::FillUniformDistribution<InDataType>{0.0f, 1.0f}(in);
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5}); ck::utils::FillUniformDistribution<OutDataType>{-0.5f, 0.5f}(prior_out);
} }
Tensor<OutDataType> out_ref(out); Tensor<OutDataType> out_ref(prior_out);
if(do_verification)
{
using ReferenceSoftmax =
tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
ReferenceSoftmax{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
}
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); DeviceMem in_dev(in.GetElementSpaceSizeInBytes());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(out.GetElementSpaceSizeInBytes());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.data());
out_dev.ToDevice(out.mData.data());
std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end()); std::vector<index_t> in_tensor_lengths(in.GetLengths().begin(), in.GetLengths().end());
std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end()); std::vector<index_t> in_tensor_strides(in.GetStrides().begin(), in.GetStrides().end());
// add device softmax instances // add device softmax instances
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceOpPtr = tensor_operation::device:: using DeviceOp = tensor_operation::device::
DeviceSoftmaxPtr<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>; DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
std::vector<DeviceOpPtr> instances;
if(norm_type == NormType::SOFTMAX) // get device op instances
{ const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value && DeviceOp>::GetInstances();
is_same<AccDataType, float>::value) std::cout << "found " << instances.size() << " instances" << std::endl;
{
if constexpr(Rank == 3)
tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances(
instances);
else if constexpr(Rank == 4)
tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances(
instances);
}
else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
is_same<AccDataType, float>::value)
{
if constexpr(Rank == 3)
tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances(
instances);
else if constexpr(Rank == 4)
tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances(
instances);
}
}
if(instances.size() <= 0) if(instances.size() <= 0)
{ {
...@@ -153,21 +108,19 @@ void profile_softmax_impl(int do_verification, ...@@ -153,21 +108,19 @@ void profile_softmax_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
std::vector<bool> instance_pass;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
for(auto& inst_ptr : instances) for(auto& inst_ptr : instances)
{ {
// Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3 // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
// problem to rank 4 kernel) other than invoking IsSupportedArgument()? // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) && if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
{ {
continue; continue;
} }
auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths, auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
i_in_strides, in_tensor_strides,
reduce_dims, reduce_dims,
&alpha, &alpha,
&beta, &beta,
...@@ -181,45 +134,42 @@ void profile_softmax_impl(int do_verification, ...@@ -181,45 +134,42 @@ void profile_softmax_impl(int do_verification,
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = [", in_length, ", ") LogRange(std::cout << "input lengths = [", in_length, ", ")
<< "], " << "], "
<< "scaler = [" << alpha << ", " << beta << "]." << std::endl; << "scaler = [" << alpha << ", " << beta << "]";
return; LogRange(std::cout << ", reduce dims = [", reduce_dims, ", ") << "]." << std::endl;
instance_pass.push_back(true);
continue;
} }
out_dev.ToDevice(prior_out.data());
auto invoker_ptr = inst_ptr->MakeInvokerPointer(); auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); if(time_kernel)
{
std::size_t num_bytes = std::size_t num_bytes =
in.mDesc.GetElementSize() * sizeof(InDataType) + in.GetElementSize() * sizeof(InDataType) +
(beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType); (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl; << inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time) if(avg_time < best_avg_time)
{ {
best_instance_name = inst_ptr->GetTypeString(); best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time; best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec; best_gb_per_sec = gb_per_sec;
}
} }
if(do_verification) if(do_verification)
{ {
// TODO: factory method to dynamically switch between different reference normalizations out_dev.FromDevice(out.data());
using ReferenceFactory = bool pass = true;
tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
out_dev.FromDevice(out.mData.data());
bool pass;
if(std::is_same<InDataType, int8_t>::value) if(std::is_same<InDataType, int8_t>::value)
{ {
pass = ck::utils::check_err( pass = pass && ck::utils::check_err(
out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1); out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
if(do_log) if(do_log)
{ {
LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -230,7 +180,7 @@ void profile_softmax_impl(int do_verification, ...@@ -230,7 +180,7 @@ void profile_softmax_impl(int do_verification,
} }
else else
{ {
pass = ck::utils::check_err(out.mData, out_ref.mData); pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -247,16 +197,22 @@ void profile_softmax_impl(int do_verification, ...@@ -247,16 +197,22 @@ void profile_softmax_impl(int do_verification,
<< "], " << "], "
<< "scaler = [" << alpha << ", " << beta << "]." << std::endl; << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
} }
instance_pass.push_back(pass);
} }
} }
std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_" if(time_kernel)
<< type_to_string<OutDataType>() << ", "; {
LogRange(std::cout << "length = ", i_in_lengths, ",") << ", "; std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
LogRange(std::cout << "stride = ", i_in_strides, ",") << ", "; << type_to_string<OutDataType>() << ", ";
LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", "; LogRange(std::cout << "length = ", in_tensor_lengths, ",") << ", ";
std::cout << "alpha = " << alpha << ", " LogRange(std::cout << "stride = ", in_tensor_strides, ",") << ", ";
<< "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
<< " GB/s, " << best_instance_name << std::endl; std::cout << "alpha = " << alpha << ", "
<< "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_instance_name << std::endl;
}
return std::all_of(
std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
} }
} // namespace profiler } // namespace profiler
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_conv_bwd_weight_impl.hpp" #include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
namespace { namespace {
enum struct ConvLayout enum struct ConvLayout
{ {
NCHW_KCYX_NKHW, // 0 GNCHW_GKCYX_GNKHW, // 0
NHWC_KYXC_NHWK, // 1 GNHWC_GKYXC_GNHWK, // 1
}; };
enum struct ConvDataType enum struct ConvDataType
...@@ -25,24 +25,25 @@ enum struct ConvDataType ...@@ -25,24 +25,25 @@ enum struct ConvDataType
static void print_helper_msg() static void print_helper_msg()
{ {
std::cout std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
<< "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n" << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" << " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n" << " 2: Input bf16, Weight fp32, Output bf16)\n"
<< " 2: Input bf16, Weight fp32, Output bf16)\n" << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
<< "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n" "N, K, Ho, Wo]\n"
<< " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, K]\n" << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
<< "arg4: verification (0: no, 1: yes)\n" "N, Ho, Wo, K]\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" << "arg4: verification (0: no, 1: yes)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg7: time kernel (0: no, 1: yes)\n" << "arg6: print tensor value (0: no; 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n" << "arg7: time kernel (0: no, 1: yes)\n"
<< std::endl; << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
<< std::endl;
} }
} // namespace } // namespace
int profile_conv_bwd_weight(int argc, char* argv[]) int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{ {
// 8 for control, 1 for num_dim_spatial // 8 for control, 1 for num_dim_spatial
if(argc < 9) if(argc < 9)
...@@ -75,17 +76,17 @@ int profile_conv_bwd_weight(int argc, char* argv[]) ...@@ -75,17 +76,17 @@ int profile_conv_bwd_weight(int argc, char* argv[])
using F16 = ck::half_t; using F16 = ck::half_t;
using BF16 = ck::bhalf_t; using BF16 = ck::bhalf_t;
using NWC = ck::tensor_layout::convolution::NWC; using GNWC = ck::tensor_layout::convolution::GNWC;
using NHWC = ck::tensor_layout::convolution::NHWC; using GNHWC = ck::tensor_layout::convolution::GNHWC;
using NDHWC = ck::tensor_layout::convolution::NDHWC; using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
using KXC = ck::tensor_layout::convolution::KXC; using GKXC = ck::tensor_layout::convolution::GKXC;
using KYXC = ck::tensor_layout::convolution::KYXC; using GKYXC = ck::tensor_layout::convolution::GKYXC;
using KZYXC = ck::tensor_layout::convolution::KZYXC; using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
using NWK = ck::tensor_layout::convolution::NWK; using GNWK = ck::tensor_layout::convolution::GNWK;
using NHWK = ck::tensor_layout::convolution::NHWK; using GNHWK = ck::tensor_layout::convolution::GNHWK;
using NDHWK = ck::tensor_layout::convolution::NDHWK; using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
constexpr auto I1 = ck::Number<1>{}; constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{}; constexpr auto I2 = ck::Number<2>{};
...@@ -108,64 +109,64 @@ int profile_conv_bwd_weight(int argc, char* argv[]) ...@@ -108,64 +109,64 @@ int profile_conv_bwd_weight(int argc, char* argv[])
using WeiDataType = decltype(wei_type); using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type); using OutDataType = decltype(out_type);
bool pass = ck::profiler::profile_conv_bwd_weight_impl<NDimSpatial, bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType>( OutDataType>(
do_verification, init_method, do_log, time_kernel, params, split_k); do_verification, init_method, do_log, time_kernel, params, split_k);
return pass ? 0 : 1; return pass ? 0 : 1;
}; };
if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK) if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) else if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) else if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, F32{}, BF16{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{});
} }
} }
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK) else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) else if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) else if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, F32{}, BF16{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{});
} }
} }
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK) else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) else if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) else if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, F32{}, BF16{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
} }
} }
......
...@@ -8,14 +8,10 @@ ...@@ -8,14 +8,10 @@
#include "profiler/include/profile_softmax_impl.hpp" #include "profiler/include/profile_softmax_impl.hpp"
using ck::index_t; using ck::index_t;
using ck::profiler::NormDataType; using ck::profiler::SoftmaxDataType;
using ck::profiler::NormType;
struct ArgParser struct ArgParser
{ {
std::unordered_map<std::string, NormType> norm_dict = {{"batchnorm", NormType::BATCHNORM},
{"softmax", NormType::SOFTMAX}};
std::unordered_map<std::string, std::vector<int>> long_opts = { std::unordered_map<std::string, std::vector<int>> long_opts = {
{"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}}; {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};
...@@ -50,7 +46,7 @@ struct ArgParser ...@@ -50,7 +46,7 @@ struct ArgParser
void print_help() void print_help()
{ {
std::cout << "arg1: tensor operation (batchnorm/softmax)\n" std::cout << "arg1: tensor operation (softmax)\n"
<< "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
<< "arg3: verification (0: no; 1: yes)\n" << "arg3: verification (0: no; 1: yes)\n"
<< "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
...@@ -64,7 +60,7 @@ void print_help() ...@@ -64,7 +60,7 @@ void print_help()
<< std::endl; << std::endl;
} }
int profile_normalization(int argc, char* argv[]) int profile_softmax(int argc, char* argv[])
{ {
if(argc <= 2) if(argc <= 2)
{ {
...@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[]) ...@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[])
ArgParser arg_parser; ArgParser arg_parser;
// short unnamed options // short unnamed options
const NormType norm_type = arg_parser.norm_dict[argv[1]]; const SoftmaxDataType data_type = static_cast<SoftmaxDataType>(std::stoi(argv[2]));
const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2])); const bool do_verification = std::stoi(argv[3]);
const bool do_verification = std::stoi(argv[3]); const int init_method = std::stoi(argv[4]);
const int init_method = std::stoi(argv[4]); const bool do_log = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[5]); const bool time_kernel = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[6]);
// parse the long options // parse the long options
arg_parser(argc, argv); arg_parser(argc, argv);
...@@ -91,9 +86,10 @@ int profile_normalization(int argc, char* argv[]) ...@@ -91,9 +86,10 @@ int profile_normalization(int argc, char* argv[])
arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0]; arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0]; const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];
// Rank 3
if(length.size() == 3) if(length.size() == 3)
{ {
if(data_type == NormDataType::F16_F16) if(data_type == SoftmaxDataType::F16_F16)
{ {
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification, ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
init_method, init_method,
...@@ -103,10 +99,9 @@ int profile_normalization(int argc, char* argv[]) ...@@ -103,10 +99,9 @@ int profile_normalization(int argc, char* argv[])
stride, stride,
reduce, reduce,
float(alpha), float(alpha),
float(beta), float(beta));
norm_type);
} }
else if(data_type == NormDataType::F32_F32) else if(data_type == SoftmaxDataType::F32_F32)
{ {
ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification, ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
init_method, init_method,
...@@ -116,17 +111,17 @@ int profile_normalization(int argc, char* argv[]) ...@@ -116,17 +111,17 @@ int profile_normalization(int argc, char* argv[])
stride, stride,
reduce, reduce,
float(alpha), float(alpha),
float(beta), float(beta));
norm_type);
} }
else else
{ {
throw std::runtime_error("not implemented yet"); throw std::runtime_error("not implemented yet");
} }
} }
// Rank 4
else if(length.size() == 4) else if(length.size() == 4)
{ {
if(data_type == NormDataType::F16_F16) if(data_type == SoftmaxDataType::F16_F16)
{ {
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification, ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
init_method, init_method,
...@@ -136,10 +131,9 @@ int profile_normalization(int argc, char* argv[]) ...@@ -136,10 +131,9 @@ int profile_normalization(int argc, char* argv[])
stride, stride,
reduce, reduce,
float(alpha), float(alpha),
float(beta), float(beta));
norm_type);
} }
else if(data_type == NormDataType::F32_F32) else if(data_type == SoftmaxDataType::F32_F32)
{ {
ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification, ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
init_method, init_method,
...@@ -149,8 +143,7 @@ int profile_normalization(int argc, char* argv[]) ...@@ -149,8 +143,7 @@ int profile_normalization(int argc, char* argv[])
stride, stride,
reduce, reduce,
float(alpha), float(alpha),
float(beta), float(beta));
norm_type);
} }
else else
{ {
......
...@@ -18,9 +18,9 @@ int profile_conv_fwd(int, char*[]); ...@@ -18,9 +18,9 @@ int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_bwd_data(int, char*[]); int profile_conv_bwd_data(int, char*[]);
int profile_conv_bwd_weight(int, char*[]);
int profile_grouped_conv_fwd(int, char*[]); int profile_grouped_conv_fwd(int, char*[]);
int profile_normalization(int, char*[]); int profile_grouped_conv_bwd_weight(int, char*[]);
int profile_softmax(int, char*[]);
int profile_layernorm(int, char*[]); int profile_layernorm(int, char*[]);
int profile_groupnorm(int, char*[]); int profile_groupnorm(int, char*[]);
int profile_reduce(int, char*[]); int profile_reduce(int, char*[]);
...@@ -43,8 +43,9 @@ static void print_helper_message() ...@@ -43,8 +43,9 @@ static void print_helper_message()
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n" " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n" " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
" conv_bwd_data: Convolution Backward Data\n" " conv_bwd_data: Convolution Backward Data\n"
" conv_bwd_weight: Convolution Backward Weight\n"
" grouped_conv_fwd: Grouped Convolution Forward\n" " grouped_conv_fwd: Grouped Convolution Forward\n"
" grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
" softmax: Softmax\n"
" reduce: Reduce\n"); " reduce: Reduce\n");
// clang-format on // clang-format on
} }
...@@ -117,21 +118,21 @@ int main(int argc, char* argv[]) ...@@ -117,21 +118,21 @@ int main(int argc, char* argv[])
{ {
return profile_conv_bwd_data(argc, argv); return profile_conv_bwd_data(argc, argv);
} }
else if(strcmp(argv[1], "conv_bwd_weight") == 0)
{
return profile_conv_bwd_weight(argc, argv);
}
else if(strcmp(argv[1], "grouped_conv_fwd") == 0) else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
{ {
return profile_grouped_conv_fwd(argc, argv); return profile_grouped_conv_fwd(argc, argv);
} }
else if(strcmp(argv[1], "conv_bwd_weight") == 0)
{
return profile_grouped_conv_bwd_weight(argc, argv);
}
else if(strcmp(argv[1], "reduce") == 0) else if(strcmp(argv[1], "reduce") == 0)
{ {
return profile_reduce(argc, argv); return profile_reduce(argc, argv);
} }
else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0) else if(strcmp(argv[1], "softmax") == 0)
{ {
return profile_normalization(argc, argv); return profile_softmax(argc, argv);
} }
else if(strcmp(argv[1], "layernorm") == 0) else if(strcmp(argv[1], "layernorm") == 0)
{ {
......
...@@ -11,7 +11,7 @@ cmake ...@@ -11,7 +11,7 @@ cmake
-D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ -D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \ -D BUILD_DEV=ON \
-D GPU_TARGETS=gfx908;gfx90a \ -D GPU_TARGETS="gfx908;gfx90a" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \ -D USE_BITINT_EXTENSION_INT4=OFF \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
......
...@@ -11,7 +11,7 @@ cmake ...@@ -11,7 +11,7 @@ cmake
-D CMAKE_CXX_FLAGS="-O3" \ -D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=OFF \ -D BUILD_DEV=OFF \
-D GPU_TARGETS=gfx908;gfx90a \ -D GPU_TARGETS="gfx908;gfx90a" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \ -D USE_BITINT_EXTENSION_INT4=OFF \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment