Commit 05ee41c3 authored by Rosty Geyyer

Merge branch 'develop' into lwpck-471

parents 37116c98 ad541ad6
@@ -12,6 +12,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
 
 namespace ck {
@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            using namespace ck::literals;
+
             if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
             }
             else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
             }
         };
@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
                                               f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
 
     // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k({K});
 
     // residual: assume same layout as output tensor
     Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
@@ -251,8 +251,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
         {
             out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
 
-            ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
-                                 out_n_k_ho_wo_host_result.mData);
+            ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
 
             if(do_log)
             {
...
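The `1_uz` in the rewritten stride lists comes from the newly included "ck/library/utility/literals.hpp". A minimal sketch of the idea, assuming the header provides a plain std::size_t user-defined literal (the real header may differ in detail): keeping every element of the braced list a std::size_t lets the new brace-initialized HostTensorDescriptor overload deduce a uniform std::initializer_list<std::size_t> instead of mixing a plain int 1 into it.

// Sketch only: assumed shape of the _uz literal from literals.hpp.
#include <cstddef>

namespace ck {
namespace literals {

// 1_uz is a std::size_t, so {C_ * H * W, H * W, W, 1_uz} deduces uniformly
// to std::initializer_list<std::size_t> rather than mixing int into the list.
constexpr std::size_t operator""_uz(unsigned long long x) { return x; }

} // namespace literals
} // namespace ck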
@@ -12,6 +12,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
 
 namespace ck {
@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            using namespace ck::literals;
+
             if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
             }
             else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
             }
         };
@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
                                               f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
 
     // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k({K});
 
     std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
@@ -239,8 +239,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
         {
             out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
 
-            ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
-                                 out_n_k_ho_wo_host_result.mData);
+            ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
 
             if(do_log)
             {
...
@@ -191,7 +191,7 @@ bool profile_conv_fwd_impl(int do_verification,
         {
             out_device_buf.FromDevice(device_output.mData.data());
 
-            pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+            pass = pass & ck::utils::check_err(device_output, host_output);
 
             if(do_log)
             {
...
@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
             std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
         }
 
-        success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
+        success = ck::utils::check_err(input_host_result, input_device_result);
 
         if(do_log)
         {
...
@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
         {
             wei_device_buf.FromDevice(weights_device_result.mData.data());
 
-            success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
+            success = ck::utils::check_err(weights_host_result, weights_device_result);
 
            if(success == false)
            {
...
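A recurring change throughout this commit replaces `check_err(x.mData, y.mData)` with `check_err(x, y)`, passing the Tensor objects directly. A hedged, illustrative sketch of the range-based shape such an overload can take (names and default tolerances here are assumptions, not the actual ck::utils implementation):

// Illustrative only: a range-based comparison accepting any containers with
// begin()/end(), so Tensor<T> works as well as std::vector<T>.
#include <cmath>
#include <iostream>
#include <iterator>

template <typename ResultRange, typename RefRange>
bool check_err_sketch(const ResultRange& out,
                      const RefRange& ref,
                      double rtol = 1e-5,
                      double atol = 3e-6)
{
    auto o = std::begin(out);
    auto r = std::begin(ref);

    for(; o != std::end(out) && r != std::end(ref); ++o, ++r)
    {
        const double diff = std::abs(static_cast<double>(*o) - static_cast<double>(*r));

        if(diff > atol + rtol * std::abs(static_cast<double>(*r)))
        {
            std::cout << "mismatch: " << static_cast<double>(*o) << " vs "
                      << static_cast<double>(*r) << std::endl;
            return false;
        }
    }

    return true;
}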
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iomanip>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

namespace ck {
namespace profiler {

template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise2D(HostTensorC& C,
                        const HostTensorA& A,
                        const HostTensorB& B,
                        const std::vector<std::size_t>& shape,
                        Functor functor)
{
    using ctype = ck::remove_reference_t<decltype(C(0, 0))>;

    for(std::size_t m = 0; m < shape[0]; ++m)
        for(std::size_t n = 0; n < shape[1]; ++n)
        {
            auto a_val  = A(m, n);
            auto b_val  = B(m, n);
            ctype c_val = 0;
            functor(c_val, a_val, b_val);
            C(m, n) = c_val;
        }
}

template <typename ADataType,
          typename BDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename AccDataType,
          typename YDataType>
bool profile_elementwise_layernorm_impl(int do_verification,
                                        int init_method,
                                        bool do_log,
                                        bool time_kernel,
                                        std::vector<index_t> length)
{
    using Add         = ck::tensor_operation::element_wise::Add;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    if(length.size() != 2)
        return false;

    index_t M      = length[0];
    index_t N      = length[1];
    index_t Stride = N;

    constexpr int Rank         = 2;
    constexpr int NumReduceDim = 1;

    std::vector<index_t> reduce_dim      = {1};
    std::vector<index_t> gammaBetaLength = {N};
    std::vector<index_t> gammaBetaStride = {0, 1};

    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
        using namespace ck::literals;

        return HostTensorDescriptor({row, col}, {stride, 1_uz});
    };

    Tensor<ADataType> a(length);
    Tensor<BDataType> b(length);
    Tensor<GammaDataType> gamma(gammaBetaLength);
    Tensor<BetaDataType> beta(gammaBetaLength);
    Tensor<YDataType> y(length);
    Tensor<YDataType> host_y(length);

    switch(init_method)
    {
    case 0:
        a.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
        b.GenerateTensorValue(GeneratorTensor_1<BDataType>{});
        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
        beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
        break;
    case 1:
        a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
        beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
        break;
    default:
        a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1});
        b.GenerateTensorValue(GeneratorTensor_3<BDataType>{0, 1});
        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
    }

    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_dev(sizeof(ADataType) * b.mDesc.GetElementSpaceSize());
    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());

    a_dev.ToDevice(a.mData.data());
    b_dev.ToDevice(b.mData.data());
    gamma_dev.ToDevice(gamma.mData.data());
    beta_dev.ToDevice(beta.mData.data());

    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};

    // add device normalization instances
    using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
        ck::Tuple<ADataType, BDataType>,
        GammaDataType,
        BetaDataType,
        AccDataType,
        YDataType,
        Add,
        PassThrough,
        2,
        1>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        using XDataType = ADataType;

        std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
                                       static_cast<unsigned long>(N)};
        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
        host_elementwise2D<Tensor<ADataType>, Tensor<BDataType>, Tensor<XDataType>, Add>(
            x, a, b, mn, Add{});

        using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType,
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
                                                                                 AccDataType,
                                                                                 PassThrough,
                                                                                 Rank,
                                                                                 NumReduceDim>;

        ReferenceInstance ref;
        auto ref_argument =
            ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4);
        auto ref_invoker = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);
    }

    int num_kernel = 0;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            length,
            {
                std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
                std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
            },
            gammaBetaStride,
            gammaBetaStride,
            std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
            reduce_dim,
            1e-4,
            input,
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            y_dev.GetDeviceBuffer(),
            Add{},
            PassThrough{});

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
        }
        else
        {
            continue;
        }

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) +
                                b.mDesc.GetElementSize() * sizeof(BDataType) +
                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
                                y.mDesc.GetElementSize() * sizeof(YDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            y_dev.FromDevice(y.mData.data());

            bool pass =
                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "b : ", b.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
            }

            if(!pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
                return false;
            }
            else
            {
                if(time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", length, ",") << ", ";
        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is tested" << std::endl;
        return false;
    }

    return true;
}

} // namespace profiler
} // namespace ck
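For reference, a hypothetical call site for the new profiler (not part of the commit; the instantiation below is illustrative); the template-parameter order follows the declaration above:

// Hypothetical invocation: fp16 inputs with fp32 accumulation on M x N = 1024 x 8192.
bool ok = ck::profiler::profile_elementwise_layernorm_impl<ck::half_t, // ADataType
                                                           ck::half_t, // BDataType
                                                           ck::half_t, // GammaDataType
                                                           ck::half_t, // BetaDataType
                                                           float,      // AccDataType
                                                           ck::half_t> // YDataType
    (/*do_verification=*/1, /*init_method=*/2, /*do_log=*/false, /*time_kernel=*/true, {1024, 8192});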
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
 {
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
     // run reference
     if(do_verification)
     {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        Tensor<AccDataType> c_m_n({M, N});
 
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
         {
             e_device_buf.FromDevice(e_m_n_device_result.mData.data());
 
-            pass = pass &&
-                   ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+            pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
         }
     }
     else
...
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                                        int StrideD0)
 {
     auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({len}),
-                                    std::vector<std::size_t>({stride}));
+        return HostTensorDescriptor({len}, {stride});
     };
 
     auto f_host_tensor_descriptor2d =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };
@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
     Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
     Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_host_result({M});
+    Tensor<ReduceDataType> reduce1_m_host_result({M});
 
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_device_result({M});
+    Tensor<ReduceDataType> reduce1_m_device_result({M});
 
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
         reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-        ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-        ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+        ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+        ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
+        ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
 
         if(do_log)
         {
...
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification,
 {
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
     // run reference
     if(do_verification)
     {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        Tensor<AccDataType> c_m_n({M, N});
 
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
         {
             e_device_buf.FromDevice(e_m_n_device_result.mData.data());
 
-            pass = pass &&
-                   ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+            pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
         }
     }
     else
...
@@ -18,6 +18,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification,
         {
             c_device_buf.FromDevice(c_m_n_device_result.mData.data());
 
-            pass =
-                pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+            pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
 
             if(do_log)
             {
...
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification,
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_host_result({M});
+    Tensor<ReduceDataType> reduce1_m_host_result({M});
 
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_device_result({M});
+    Tensor<ReduceDataType> reduce1_m_device_result({M});
 
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification,
         reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
         reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-        ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-        ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+        ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+        ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
+        ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
 
         if(do_log)
         {
...
@@ -18,6 +18,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification,
         {
            c_device_buf.FromDevice(c_m_n_device_result.mData.data());
 
-            pass =
-                pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+            pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
 
            if(do_log)
            {
...
@@ -3,9 +3,10 @@
 #pragma once
 
-#include "ck/ck.hpp"
+#include <algorithm>
 #include <iomanip>
 #include <iostream>
+#include <iterator>
 #include <typeinfo>
 
+#include "ck/ck.hpp"
@@ -13,7 +14,7 @@
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
-#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -26,32 +27,6 @@
 namespace ck {
 namespace profiler {
 
-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
-
 template <ck::index_t NDimSpatial,
           typename InLayout,
           typename WeiLayout,
@@ -59,12 +34,12 @@ template <ck::index_t NDimSpatial,
           typename InDataType,
           typename WeiDataType,
           typename OutDataType>
-bool profile_conv_bwd_weight_impl(int do_verification,
-                                  int init_method,
-                                  bool do_log,
-                                  bool time_kernel,
-                                  const ck::utils::conv::ConvParam& conv_param,
-                                  ck::index_t split_k)
+bool profile_grouped_conv_bwd_weight_impl(int do_verification,
+                                          int init_method,
+                                          bool do_log,
+                                          bool time_kernel,
+                                          const ck::utils::conv::ConvParam& conv_param,
+                                          ck::index_t split_k)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -114,16 +89,14 @@ bool profile_conv_bwd_weight_impl(int do_verification,
     if(do_verification)
     {
         auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
                                                                            InDataType,
                                                                            WeiDataType,
                                                                            OutDataType,
                                                                            InElementOp,
                                                                            WeiElementOp,
                                                                            OutElementOp>{};
 
-        auto ref_invoker = ref_conv.MakeInvoker();
-
+        auto ref_invoker  = ref_conv.MakeInvoker();
         auto ref_argument = ref_conv.MakeArgument(input,
                                                   weight_host_result,
                                                   output,
@@ -138,16 +111,16 @@ bool profile_conv_bwd_weight_impl(int do_verification,
         ref_invoker.Run(ref_argument);
     }
 
-    using DeviceOp = ck::tensor_operation::device::DeviceConvBwdWeight<NDimSpatial,
-                                                                       InLayout,
-                                                                       WeiLayout,
-                                                                       OutLayout,
-                                                                       InDataType,
-                                                                       WeiDataType,
-                                                                       OutDataType,
-                                                                       InElementOp,
-                                                                       WeiElementOp,
-                                                                       OutElementOp>;
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
+                                                                              InLayout,
+                                                                              WeiLayout,
+                                                                              OutLayout,
+                                                                              InDataType,
+                                                                              WeiDataType,
+                                                                              OutDataType,
+                                                                              InElementOp,
+                                                                              WeiElementOp,
+                                                                              OutElementOp>;
 
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -163,22 +136,41 @@ bool profile_conv_bwd_weight_impl(int do_verification,
     // profile device Conv instances
     bool all_pass = true;
 
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
+
+    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
+    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
+    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
+    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
+    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
+    range_copy(conv_param.input_right_pads_, begin(input_right_pads));
+
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr =
             op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                         static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                         static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        conv_param.G_,
                                         conv_param.N_,
                                         conv_param.K_,
                                         conv_param.C_,
-                                        conv_param.input_spatial_lengths_,
-                                        conv_param.filter_spatial_lengths_,
-                                        conv_param.output_spatial_lengths_,
-                                        conv_param.conv_filter_strides_,
-                                        conv_param.conv_filter_dilations_,
-                                        conv_param.input_left_pads_,
-                                        conv_param.input_right_pads_,
+                                        input_spatial_lengths,
+                                        filter_spatial_lengths,
+                                        output_spatial_lengths,
+                                        conv_filter_strides,
+                                        conv_filter_dilations,
+                                        input_left_pads,
+                                        input_right_pads,
                                         in_element_op,
                                         wei_element_op,
                                         out_element_op,
@@ -217,33 +209,29 @@ bool profile_conv_bwd_weight_impl(int do_verification,
             {
                 wei_device_buf.FromDevice(weight_device_result.mData.data());
 
-                bool pass =
-                    ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
+                bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
 
                 if(!pass)
                 {
-                    std::cout << "Fail info:" << op_ptr->GetTypeString() << std::endl;
+                    std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
                 }
 
                 all_pass &= pass;
 
                 if(do_log)
                 {
-                    std::cout << "in : ";
-                    show_data_nhwc_layout(output);
-                    std::cout << std::endl;
-
-                    std::cout << "wei: ";
-                    show_data_nhwc_layout(weight_host_result);
-                    std::cout << std::endl;
-
-                    std::cout << "out : ";
-                    show_data_nhwc_layout(input);
-                    std::cout << std::endl;
-
-                    std::cout << "wei_device: ";
-                    show_data_nhwc_layout(weight_device_result);
-                    std::cout << std::endl;
+                    LogRangeAsType<float>(std::cout << "output : ", output.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "weight (device): ", weight_device_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "weight (host): ", weight_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "input: ", input.mData, ",") << std::endl;
                 }
             }
         }
...
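The `range_copy` staging introduced above exists because the grouped interface takes fixed-size `std::array<ck::index_t, NDimSpatial>` arguments while `ck::utils::conv::ConvParam` stores its fields as `std::vector`. Distilled into a standalone sketch (the element type `long` stands in for ck::index_t, and the variable names are illustrative):

#include <algorithm>
#include <array>
#include <vector>

int main()
{
    constexpr int NDimSpatial = 2;

    // Stand-in for a std::vector field such as conv_param.conv_filter_strides_.
    std::vector<long> strides_vec{2, 2};

    // Fixed-size array of the kind expected by the grouped-conv MakeArgumentPointer.
    std::array<long, NDimSpatial> strides_arr{};

    // Same helper as in the diff: unqualified begin/end resolve via ADL.
    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };

    range_copy(strides_vec, begin(strides_arr));
}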
@@ -9,14 +9,12 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -69,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     std::array<ck::index_t, NDimSpatial> input_left_pads{};
     std::array<ck::index_t, NDimSpatial> input_right_pads{};
 
-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
 
     copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
     copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
@@ -182,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
         {
             out_device_buf.FromDevice(device_output.mData.data());
 
-            pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+            pass = pass & ck::utils::check_err(device_output, host_output);
 
             if(do_log)
             {
@@ -224,26 +222,25 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     for(auto& op_ptr : op_ptrs)
     {
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            in_device_buf.GetDeviceBuffer(),
-            wei_device_buf.GetDeviceBuffer(),
-            std::array<const void*, 0>{},
-            out_device_buf.GetDeviceBuffer(),
-            a_g_n_c_wis_lengths,
-            a_g_n_c_wis_strides,
-            b_g_k_c_xs_lengths,
-            b_g_k_c_xs_strides,
-            std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-            std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-            e_g_n_k_wos_lengths,
-            e_g_n_k_wos_strides,
-            conv_filter_strides,
-            conv_filter_dilations,
-            input_left_pads,
-            input_right_pads,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                        wei_device_buf.GetDeviceBuffer(),
+                                                        {},
+                                                        out_device_buf.GetDeviceBuffer(),
+                                                        a_g_n_c_wis_lengths,
+                                                        a_g_n_c_wis_strides,
+                                                        b_g_k_c_xs_lengths,
+                                                        b_g_k_c_xs_strides,
+                                                        {},
+                                                        {},
+                                                        e_g_n_k_wos_lengths,
+                                                        e_g_n_k_wos_strides,
+                                                        conv_filter_strides,
+                                                        conv_filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        in_element_op,
+                                                        wei_element_op,
+                                                        out_element_op);
 
         run_impl(op_ptr, argument_ptr);
     }
@@ -262,8 +259,10 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
                                                          WeiElementOp,
                                                          OutElementOp>;
 
+    // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
+
     std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
 
     for(auto& op_ptr : op_ptrs)
...
@@ -17,6 +17,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                       c_element_op);
 
             ref_invoker.Run(ref_argument);
 
-            pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
-                                                c_m_n_host_result.mData);
+            pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
 
             if(do_log)
             {
...
@@ -126,6 +126,8 @@ bool profile_groupnorm_impl(int do_verification,
                                                           gamma_dev.GetDeviceBuffer(),
                                                           beta_dev.GetDeviceBuffer(),
                                                           y_dev.GetDeviceBuffer(),
+                                                          nullptr,
+                                                          nullptr,
                                                           PassThrough{});
 
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
@@ -163,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification,
         {
             y_dev.FromDevice(y.mData.data());
 
-            bool pass =
-                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
+            bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
 
             if(do_log)
             {
@@ -196,7 +197,7 @@ bool profile_groupnorm_impl(int do_verification,
     if(num_kernel == 0)
     {
-        std::cout << "Error: No kernel is tested" << std::endl;
+        std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
...
@@ -22,7 +22,7 @@ template <typename XDataType,
           typename AccDataType,
           typename YDataType,
           index_t Rank>
-void profile_layernorm_impl(int do_verification,
+bool profile_layernorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
@@ -31,7 +31,7 @@ void profile_layernorm_impl(int do_verification,
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
     if(length.size() < 2)
-        return;
+        return false;
 
     // Assume normalize dimension except for batch (first) dimension
     std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
@@ -52,7 +52,6 @@ void profile_layernorm_impl(int do_verification,
     switch(init_method)
     {
-    // case 0: break;
     case 0:
         x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
         gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
@@ -122,6 +121,8 @@ void profile_layernorm_impl(int do_verification,
         ref_invoker.Run(ref_argument);
     }
 
+    int num_kernel = 0;
+
     for(auto& inst_ptr : instance_ptrs)
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
@@ -135,12 +136,21 @@ void profile_layernorm_impl(int do_verification,
                                                           gamma_dev.GetDeviceBuffer(),
                                                           beta_dev.GetDeviceBuffer(),
                                                           y_dev.GetDeviceBuffer(),
+                                                          nullptr,
+                                                          nullptr,
                                                           PassThrough{});
 
-        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            ++num_kernel;
+        }
+        else
         {
-            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
-            LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            }
+
             continue;
         }
@@ -156,8 +166,9 @@ void profile_layernorm_impl(int do_verification,
         float gb_per_sec = num_bytes / 1.E6 / avg_time;
 
-        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
-                  << inst_ptr->GetTypeString() << std::endl;
+        if(time_kernel)
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
 
         if(avg_time < best_avg_time)
         {
@@ -184,20 +195,32 @@ void profile_layernorm_impl(int do_verification,
             {
                 std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                 LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
-                return;
+                return false;
             }
             else
             {
-                std::cout << "pass" << std::endl;
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
             }
         }
     }
 
-    LogRange(std::cout << "length = ", length, ",") << ", ";
-    LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
-    LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
-    std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
-              << best_instance_name << std::endl;
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", length, ",") << ", ";
+        LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
+        LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+
+    return true;
 }
 
 } // namespace profiler
...
@@ -6,8 +6,9 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
 
-#include "ck/library/utility/check_err.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_reduction.hpp"
 #include "ck/library/utility/host_common_util.hpp"
@@ -359,10 +360,10 @@ bool profile_reduce_impl_impl(bool do_verification,
     std::array<index_t, NumOutDim> arrOutLengths;
     std::array<index_t, NumOutDim> arrOutStrides;
 
-    std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
-    std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
-    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
-    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
 
     for(auto& reduce_ptr : reduce_ptrs)
     {
@@ -411,13 +412,12 @@ bool profile_reduce_impl_impl(bool do_verification,
             bool single_pass;
 
             out_dev.FromDevice(out.mData.data());
-            single_pass = ck::utils::check_err(out.mData, out_ref.mData);
+            single_pass = ck::utils::check_err(out, out_ref);
 
             if(OutputIndex)
             {
                 out_indices_dev.FromDevice(out_indices.mData.data());
-                single_pass = single_pass &&
-                              ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+                single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
             };
 
             if(!single_pass)
@@ -492,7 +492,7 @@ bool profile_reduce_impl(bool do_verification,
     std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;
 
-    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
+    ck::ranges::copy(reduceDims, arrReduceDims.begin());
 
     pass = pass && profile_reduce_impl_impl<InDataType,
                                             AccDataType,
...
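Several hunks swap `std::copy(x.begin(), x.end(), out)` for `ck::ranges::copy(x, out)` from the newly included "ck/library/utility/algorithm.hpp". The presumed shape of that helper, assuming it is a thin range wrapper over std::copy (the actual header may differ):

// Assumed shape of ck::ranges::copy; sketch only.
#include <algorithm>
#include <iterator>

namespace ck {
namespace ranges {

template <typename InputRange, typename OutputIterator>
OutputIterator copy(InputRange&& range, OutputIterator iter)
{
    return std::copy(std::begin(range), std::end(range), iter);
}

} // namespace ranges
} // namespace ck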
@@ -3,55 +3,27 @@
 #pragma once
 
+#include <algorithm>
 #include <iomanip>
-#include <iostream>
-#include <string>
-#include <vector>
 
 #include "ck/ck.hpp"
 #include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/utility/data_type.hpp"
 
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-namespace {
-using F16 = ck::half_t;
-using F32 = float;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-} // namespace
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
-
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
-
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
 namespace ck {
 namespace profiler {
 
-enum struct NormType
-{
-    BATCHNORM,
-    SOFTMAX,
-};
-
-enum struct NormDataType
+enum struct SoftmaxDataType
 {
     F32_F32, // in, out
     F16_F16,
@@ -60,7 +32,7 @@ enum struct NormDataType
 };
 
 // clang-format off
-template <typename NormDataType> std::string type_to_string();
+template <typename SoftmaxDataType> std::string type_to_string();
 template <> std::string type_to_string<float>()   { return "f32"; }
 template <> std::string type_to_string<half_t>()  { return "f16"; }
 template <> std::string type_to_string<bhalf_t>() { return "bf16"; }
@@ -69,7 +41,7 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on
 
 template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-void profile_softmax_impl(int do_verification,
+bool profile_softmax_impl(int do_verification,
                           int init_method,
                           bool do_log,
                           bool time_kernel,
@@ -77,8 +49,7 @@ void profile_softmax_impl(int do_verification,
                           std::vector<index_t> in_strides,
                           std::vector<index_t> reduce_dims,
                           AccDataType alpha,
-                          AccDataType beta,
-                          NormType norm_type)
+                          AccDataType beta)
{ {
if(Rank != in_length.size()) if(Rank != in_length.size())
{ {
...@@ -88,62 +59,46 @@ void profile_softmax_impl(int do_verification, ...@@ -88,62 +59,46 @@ void profile_softmax_impl(int do_verification,
Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length) Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
: Tensor<InDataType>(in_length, in_strides); : Tensor<InDataType>(in_length, in_strides);
Tensor<OutDataType> out(in.mDesc); Tensor<OutDataType> out(in.mDesc);
Tensor<OutDataType> prior_out(in.mDesc);
switch(init_method) switch(init_method)
{ {
// case 0: break; case 0: break;
case 0:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
break;
case 1: case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<InDataType>{-5.f, 5.f}(in.begin(), in.end());
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<OutDataType>{-5.f, 5.f}(prior_out.begin(),
prior_out.end());
break; break;
default: default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); ck::utils::FillUniformDistribution<InDataType>{0.0f, 1.0f}(in);
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5}); ck::utils::FillUniformDistribution<OutDataType>{-0.5f, 0.5f}(prior_out);
} }
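The GeneratorTensor_* initializers are replaced here by the fill utilities from ck/library/utility/fill.hpp, which are callable either on an iterator range or on a whole container. A minimal sketch of a functor with that shape, assuming only the calls visible above (the real utility may differ in members and seeding):
#include <random>

template <typename T>
struct FillUniformSketch
{
    float a_;
    float b_;

    // Iterator-range form, matching the (in.begin(), in.end()) call above.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen{11939};
        std::uniform_real_distribution<float> dist{a_, b_};
        for(; first != last; ++first)
            *first = static_cast<T>(dist(gen));
    }

    // Whole-container form, matching the FillUniformDistribution<...>{...}(in) call.
    template <typename Container>
    void operator()(Container& c) const
    {
        (*this)(c.begin(), c.end());
    }
};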
Tensor<OutDataType> out_ref(out); Tensor<OutDataType> out_ref(prior_out);
if(do_verification)
{
using ReferenceSoftmax =
tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
ReferenceSoftmax{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
}
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); DeviceMem in_dev(in.GetElementSpaceSizeInBytes());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(out.GetElementSpaceSizeInBytes());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.data());
out_dev.ToDevice(out.mData.data());
std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end()); std::vector<index_t> in_tensor_lengths(in.GetLengths().begin(), in.GetLengths().end());
std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end()); std::vector<index_t> in_tensor_strides(in.GetStrides().begin(), in.GetStrides().end());
// add device softmax instances // add device softmax instances
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceOpPtr = tensor_operation::device:: using DeviceOp = tensor_operation::device::
DeviceSoftmaxPtr<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>; DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
std::vector<DeviceOpPtr> instances;
if(norm_type == NormType::SOFTMAX) // get device op instances
{ const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value && DeviceOp>::GetInstances();
is_same<AccDataType, float>::value) std::cout << "found " << instances.size() << " instances" << std::endl;
{
if constexpr(Rank == 3)
tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances(
instances);
else if constexpr(Rank == 4)
tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances(
instances);
}
else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
is_same<AccDataType, float>::value)
{
if constexpr(Rank == 3)
tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances(
instances);
else if constexpr(Rank == 4)
tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances(
instances);
}
}
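With the factory in place, supporting another data type or rank no longer requires editing this file: callers name the DeviceSoftmax description they want and the factory returns every registered instance. A usage sketch, under the assumption that the ck/library/tensor_operation_instance/gpu/softmax.hpp header added above supplies the specialization:
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"

void list_softmax_instances_sketch()
{
    using F16         = ck::half_t;
    using F32         = float;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using DeviceOp    = ck::tensor_operation::device::
        DeviceSoftmax<F16, F32, F16, PassThrough, PassThrough, 3>;

    const auto instances = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(const auto& inst : instances)
        std::cout << inst->GetTypeString() << '\n';
}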
if(instances.size() <= 0) if(instances.size() <= 0)
{ {
...@@ -153,21 +108,19 @@ void profile_softmax_impl(int do_verification, ...@@ -153,21 +108,19 @@ void profile_softmax_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
std::vector<bool> instance_pass;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
for(auto& inst_ptr : instances) for(auto& inst_ptr : instances)
{ {
// Is it the user's responsibility to check whether the problem matches the kernel instance (e.g. a rank-3 // Is it the user's responsibility to check whether the problem matches the kernel instance (e.g. a rank-3
// problem against a rank-4 kernel) other than by invoking IsSupportedArgument()? // problem against a rank-4 kernel) other than by invoking IsSupportedArgument()?
if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) && if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
{ {
continue; continue;
} }
auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths, auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
i_in_strides, in_tensor_strides,
reduce_dims, reduce_dims,
&alpha, &alpha,
&beta, &beta,
...@@ -181,45 +134,42 @@ void profile_softmax_impl(int do_verification, ...@@ -181,45 +134,42 @@ void profile_softmax_impl(int do_verification,
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = [", in_length, ", ") LogRange(std::cout << "input lengths = [", in_length, ", ")
<< "], " << "], "
<< "scaler = [" << alpha << ", " << beta << "]." << std::endl; << "scaler = [" << alpha << ", " << beta << "]";
return; LogRange(std::cout << ", reduce dims = [", reduce_dims, ", ") << "]." << std::endl;
instance_pass.push_back(true);
continue;
} }
out_dev.ToDevice(prior_out.data());
auto invoker_ptr = inst_ptr->MakeInvokerPointer(); auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); if(time_kernel)
{
std::size_t num_bytes = std::size_t num_bytes =
in.mDesc.GetElementSize() * sizeof(InDataType) + in.GetElementSize() * sizeof(InDataType) +
(beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType); (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl; << inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time) if(avg_time < best_avg_time)
{ {
best_instance_name = inst_ptr->GetTypeString(); best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time; best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec; best_gb_per_sec = gb_per_sec;
}
} }
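The bandwidth figure above counts one read of the input plus one write of the output, and a second output-sized transfer when beta != 0 (the prior output must be fetched to be blended in); dividing bytes by 1.E6 and by milliseconds yields GB/s, since 1 MB/ms equals 1 GB/s. A worked example with hypothetical numbers:
#include <cstddef>
#include <cstdio>

int main()
{
    // f16 softmax on a 4 x 1000 x 1000 tensor with beta == 0 (hypothetical shape).
    const std::size_t elems     = 4ull * 1000 * 1000;
    const std::size_t num_bytes = elems * 2 /* input read */ + 1 * elems * 2 /* output write */;
    const float avg_time        = 0.2f;                          // ms, hypothetical timing
    const float gb_per_sec      = num_bytes / 1.e6f / avg_time;  // 16e6 / 1e6 / 0.2 == 80 GB/s
    std::printf("%.1f GB/s\n", gb_per_sec);
    return 0;
}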
if(do_verification) if(do_verification)
{ {
// TODO: factory method to dynamically switch between different reference normalizations out_dev.FromDevice(out.data());
using ReferenceFactory = bool pass = true;
tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
out_dev.FromDevice(out.mData.data());
bool pass;
if(std::is_same<InDataType, int8_t>::value) if(std::is_same<InDataType, int8_t>::value)
{ {
pass = ck::utils::check_err( pass = pass && ck::utils::check_err(
out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1); out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
if(do_log) if(do_log)
{ {
LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -230,7 +180,7 @@ void profile_softmax_impl(int do_verification, ...@@ -230,7 +180,7 @@ void profile_softmax_impl(int do_verification,
} }
else else
{ {
pass = ck::utils::check_err(out.mData, out_ref.mData); pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -247,16 +197,22 @@ void profile_softmax_impl(int do_verification, ...@@ -247,16 +197,22 @@ void profile_softmax_impl(int do_verification,
<< "], " << "], "
<< "scaler = [" << alpha << ", " << beta << "]." << std::endl; << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
} }
instance_pass.push_back(pass);
} }
} }
std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_" if(time_kernel)
<< type_to_string<OutDataType>() << ", "; {
LogRange(std::cout << "length = ", i_in_lengths, ",") << ", "; std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
LogRange(std::cout << "stride = ", i_in_strides, ",") << ", "; << type_to_string<OutDataType>() << ", ";
LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", "; LogRange(std::cout << "length = ", in_tensor_lengths, ",") << ", ";
std::cout << "alpha = " << alpha << ", " LogRange(std::cout << "stride = ", in_tensor_strides, ",") << ", ";
<< "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
<< " GB/s, " << best_instance_name << std::endl; std::cout << "alpha = " << alpha << ", "
<< "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_instance_name << std::endl;
}
return std::all_of(
std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
} }
} // namespace profiler } // namespace profiler
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <getopt.h>
#include "ck/library/utility/host_common_util.hpp"
#include "profiler/include/profile_batchnorm_backward_impl.hpp"
using ck::index_t;
using namespace std;
static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
{"reduceDims", required_argument, nullptr, 'R'},
{"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class BatchnormBwdArgParser
{
private:
int option_index = 0;
public:
std::vector<size_t> inLengths;
std::vector<int> reduceDims;
bool do_verification = false;
bool do_dumpout = false;
bool haveSavedMeanInvVar;
int data_type = 0;
int init_method = 2;
bool time_kernel = false;
BatchnormBwdArgParser() = default;
~BatchnormBwdArgParser() = default;
void show_usage(const char* cmd)
{
// clang-format off
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl;
std::cout << "Arg2 -- 1/0 to indicate whether to use saved mean and invVariance" << std::endl;
std::cout << "Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
std::cout << "Arg4 -- time kernel (0=no, 1=yes)" << std::endl;
// clang-format on
};
int operator()(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
optind++; // to skip the module name
while(1)
{
ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inLengths = getTypeValuesFromString<size_t>(optarg);
break;
case 'R':
if(!optarg)
throw std::runtime_error("Invalid option format!");
reduceDims = getTypeValuesFromString<int>(optarg);
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case 'o':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_dumpout = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return -1;
};
break;
default:
show_usage(argv[0]);
std::cerr << "Invalid cmd-line options!" << std::endl;
return -1;
};
};
if(optind + 4 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
data_type = std::atoi(argv[optind++]);
haveSavedMeanInvVar = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind++]));
if(data_type != 0 && data_type != 1 && data_type != 5 && data_type != 6)
return -1;
return 0;
};
}; // end of class BatchnormBwdArgParser
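For context, ck::host_common::getTypeValuesFromString turns the comma-separated -D and -R option strings above into typed vectors. A minimal sketch with the same call shape (the actual helper in ck/library/utility/host_common_util.hpp may differ):
#include <sstream>
#include <string>
#include <vector>

template <typename T>
std::vector<T> get_type_values_from_string_sketch(const std::string& s)
{
    std::vector<T> values;
    std::istringstream stream(s);
    std::string token;
    while(std::getline(stream, token, ','))
    {
        std::istringstream item(token);
        T value{};
        item >> value;
        values.push_back(value);
    }
    return values;
}

// e.g. get_type_values_from_string_sketch<size_t>("128,16,16,256") -> {128, 16, 16, 256}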
static const double epsilon = std::numeric_limits<float>::epsilon();
int profile_batchnorm_backward(int argc, char* argv[])
{
using ck::profiler::profile_batchnorm_backward_impl;
BatchnormBwdArgParser arg_parser;
if(arg_parser(argc, argv) != 0)
return -1;
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using F64 = double;
if(arg_parser.data_type == 0)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_backward_impl<F16, F32, F32, F32, F16, F32, F32, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
arg_parser.haveSavedMeanInvVar,
epsilon);
};
}
else if(arg_parser.data_type == 1)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_backward_impl<F32, F32, F32, F32, F32, F32, F32, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
arg_parser.haveSavedMeanInvVar,
epsilon);
};
}
else if(arg_parser.data_type == 5)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_backward_impl<BF16, F32, F32, F32, BF16, F32, F32, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
arg_parser.haveSavedMeanInvVar,
epsilon);
};
}
else if(arg_parser.data_type == 6)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_backward_impl<F64, F64, F64, F64, F64, F64, F64, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
arg_parser.haveSavedMeanInvVar,
epsilon);
};
}
return 0;
}