Commit cbcc844e authored by illsilin

merge from public repo

parents 29deceb6 1f306024
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
...@@ -67,6 +70,8 @@ int main()
float scale = 1.f;
auto i = 0;
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
...@@ -74,7 +79,7 @@ int main()
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
        (h * nchw[3]) + w] = i;
- i++;
+ i = dis(gen);
}
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
......
...@@ -32,6 +32,8 @@ std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
throw std::runtime_error("Pool3d_fwd: problem with layout. ");
return {0, 0, 0, 0, 0};
};
template <typename TensorLayout>
...@@ -53,6 +55,8 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
return HostTensorDescriptor({N_, C_, D, H, W},
                            {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
}
throw std::runtime_error("Pool3d_fwd: problem with layout. ");
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
};
template <typename DevicePoolFwdInstance,
......
...@@ -26,6 +26,8 @@ std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
throw std::runtime_error("Avgpool3d_bwd: problem with layout. ");
return {0, 0, 0, 0, 0};
};
template <typename TensorLayout>
...@@ -47,6 +49,8 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
return HostTensorDescriptor({N_, C_, D, H, W},
                            {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
}
throw std::runtime_error("Avgpool3d_bwd: problem with layout. ");
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
};
template <typename DevicePoolBwdInstance,
......
add_example_executable(example_layernorm2d_bwd_fp32 layernorm2d_bwd_fp32.cpp)
...@@ -15,16 +15,17 @@
#include "ck/library/utility/literals.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_data_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp"
- using DYDataType    = ck::half_t;
- using XDataType     = ck::half_t;
- using GammaDataType = ck::half_t;
+ using DYDataType    = float;
+ using XDataType     = float;
+ using GammaDataType = float;
using MeanInvStdDataType = float;
- using DGammaDataType = ck::half_t;
- using DBetaDataType  = ck::half_t;
- using DXDataType     = ck::half_t;
+ using DGammaDataType = float;
+ using DBetaDataType  = float;
+ using DXDataType     = float;
using ComputeDataType = float;
constexpr int Rank = 2;
...@@ -39,6 +40,7 @@ constexpr int NumReduceDim = 1;
// inv_std: [M, 1]
// Output shape
// dx: [M, N]
// dgamma: [1, N]
// dbeta: [1, N]
...@@ -46,8 +48,34 @@ constexpr int NumReduceDim = 1;
// dbeta = reduce_sum(dy, axis=0)
// [CAUTION]
- // In DeviceNormalizationBwdGammaBetaImpl, M is the invariant dimension, K is the reduced dimension
- // Hence, M in this example and DeviceNormalizationBwdGammaBetaImpl is different
+ // In DeviceNormalizationBwdDataImpl & DeviceNormalizationBwdGammaBetaImpl, M is the invariant
+ // dimension and K is the reduced dimension. Hence, M in this example and in
+ // DeviceNormalizationBwdGammaBetaImpl are different.
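// For reference, a hedged sketch of the backward-data math this example exercises (the standard
// layernorm gradient, written per row m with the mean taken over the N columns; not copied from
// the kernel itself):
//   a    = dy * gamma
//   dx_n = inv_std * (a_n - mean_n(a) - (x_n - mean) * inv_std^2 * mean_n(a * (x - mean)))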
using XDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdDataImpl<
DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
ComputeDataType,
DXDataType,
Rank,
NumReduceDim,
256, // BlockSize
8, // MThreadClusterSize
32, // KThreadClusterSize
1, // MThreadSliceSize
4, // KThreadSliceSize
true, // IsDYFastestDimReduced
4, // DYSrcVectorSize
true, // IsXFastestDimReduced
4, // XSrcVectorSize
true, // IsGammaFastestDimReduced
4, // GammaSrcVectorSize
false, // IsMeanInvStdFastestDimReduced
1, // MeanInvStdSrcVectorSize
true, // IsDXFastestDimReduced
4>; // DXDstVectorSize
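// Note (assumption, not stated in this diff): the vector sizes of 4 along the fastest (reduced)
// dimension generally require N to be a multiple of 4; IsSupportedArgument() later in main()
// rejects shapes that violate the chosen vector widths.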
using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdGammaBetaImpl<
DYDataType,
XDataType,
...@@ -58,18 +86,18 @@ using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizatio
Rank,
NumReduceDim,
256,   // BlockSize
- 8,     // ClusterInvarient
- 32,    // ClusterReduce
- 8,     // SliceInvarient
- 1,     // SliceReduce
+ 8,     // MThreadClusterSize
+ 32,    // KThreadClusterSize
+ 4,     // MThreadSliceSize
+ 1,     // KThreadSliceSize
false, // IsDYFastestDimReduced
- 8,     // DYSrcVectorSize
+ 4,     // DYSrcVectorSize
false, // IsXFastestDimReduced
- 8,     // XSrcVectorSize
+ 4,     // XSrcVectorSize
true,  // IsMeanInvStdFastestDimReduced
1,     // MeanInvStdSrcVectorSize
- 1,     // DGammaDstVectorSize
- 1>;    // DBetaDstVectorSize
+ 4,     // DGammaDstVectorSize
+ 4>;    // DBetaDstVectorSize
int main()
{
...@@ -96,16 +124,48 @@ int main()
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dx_dev(sizeof(DXDataType) * dx.mDesc.GetElementSpaceSize());
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());
// backward x
auto x_device_instance = XDeviceInstance{};
auto x_argument_ptr = x_device_instance.MakeArgumentPointer({M, N}, // lengths
{N, 1}, // dyStrides
{N, 1}, // xStrides
{0, 1}, // gammaStrides
{1, 0}, // meanStrides
{1, 0}, // invStdStrides
{N, 1}, // dxStrides
{1}, // reduceDims
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dx_dev.GetDeviceBuffer());
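// Note on the stride choices above (not part of the diff): zero strides implement broadcasting,
// so gammaStrides {0, 1} read the same [1, N] gamma row for every m, while meanStrides and
// invStdStrides {1, 0} reuse the per-row [M, 1] statistics for every n.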
if(!x_device_instance.IsSupportedArgument(x_argument_ptr.get()))
{
std::cout << "The runtime parameters are not supported." << __FILE__ << ":" << __LINE__
<< std::endl;
return 1;
};
auto x_invoker_ptr = x_device_instance.MakeInvokerPointer();
x_invoker_ptr->Run(x_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
// backward gamma & beta
auto gamma_beta_device_instance = GammaBetaDeviceInstance{};
auto gamma_beta_argument_ptr =
gamma_beta_device_instance.MakeArgumentPointer({M, N}, // inLengths
...@@ -126,7 +186,8 @@ int main()
if(!gamma_beta_device_instance.IsSupportedArgument(gamma_beta_argument_ptr.get()))
{
- std::cout << "The runtime parameters are not supported" << std::endl;
+ std::cout << "The runtime parameters are not supported." << __FILE__ << ":" << __LINE__
+           << std::endl;
return 1;
};
...@@ -156,9 +217,11 @@ int main()
dgamma_dev.FromDevice(dgamma.mData.data());
dbeta_dev.FromDevice(dbeta.mData.data());
dx_dev.FromDevice(dx.mData.data());
pass &= ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
pass &= ck::utils::check_err(dx, host_dx, "Error: Incorrect dx", 1e-3, 1e-3);
}
return (pass ? 0 : 1);
......
add_example_executable(example_layernorm2d_bwd_fp16 layernorm2d_bwd_fp16.cpp)
- add_example_executable(example_groupnorm_bwd_fp16 groupnorm_bwd_fp16.cpp)
+ add_example_executable(example_groupnorm_bwd_fp32 groupnorm_bwd_fp32.cpp)
...@@ -15,23 +15,58 @@
#include "ck/library/utility/literals.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_data_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"
- using DYDataType    = ck::half_t;
- using XDataType     = ck::half_t;
- using GammaDataType = ck::half_t;
+ using DYDataType    = float;
+ using XDataType     = float;
+ using GammaDataType = float;
using MeanInvStdDataType = float;
- using DGammaDataType = ck::half_t;
- using DBetaDataType  = ck::half_t;
- using DXDataType     = ck::half_t;
+ using DGammaDataType = float;
+ using DBetaDataType  = float;
+ using DXDataType     = float;
using ComputeDataType = float;
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
// Groupnorm
- // kernel: M , K
+ // kernel 1: M , K
// dy: N, H, W, G, C -> N * G, H * W * C
// x: N, H, W, G, C -> N * G, H * W * C
// gamma: 1, 1, 1, G, C -> 1 * G, 1 * 1 * C
// mean: N, 1, 1, G, 1 -> N * G, 1 * 1 * 1
// rstd: N, 1, 1, G, 1 -> N * G, 1 * 1 * 1
// dx: N, H, W, G, C -> N * G, H * W * C
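// Note (assumption based on the reduceDims passed later in this example): for kernel 1 the
// reduced dimensions are H, W and C (indices {1, 2, 4}) while N and G stay invariant, which is
// how the 5-D tensors above collapse to the M = N * G, K = H * W * C view.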
using XDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdDataImpl<
DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
ComputeDataType,
DXDataType,
Rank,
NumReduceDim,
256, // BlockSize
8, // MThreadClusterSize
32, // KThreadClusterSize
1, // MThreadSliceSize
4, // KThreadSliceSize
true, // IsDYFastestDimReduced
4, // DYSrcVectorSize
true, // IsXFastestDimReduced
4, // XSrcVectorSize
true, // IsGammaFastestDimReduced
4, // GammaSrcVectorSize
false, // IsMeanInvStdFastestDimReduced
1, // MeanInvStdSrcVectorSize
true, // IsDXFastestDimReduced
4>; // DXDstVectorSize
// kernel 2: M , K
// dy:   N, H, W, G, C -> G * C, N * H * W
// x:    N, H, W, G, C -> G * C, N * H * W
// mean: N, 1, 1, G, 1 -> G * 1, N * 1 * 1
...@@ -52,18 +87,18 @@ using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizatio
Rank,
NumReduceDim,
256,   // BlockSize
- 8,     // ClusterInvarient
+ 8,     // ClusterInvariant
32,    // ClusterReduce
- 8,     // SliceInvarient
+ 4,     // SliceInvariant
1,     // SliceReduce
false, // IsDYFastestDimReduced
- 8,     // DYSrcVectorSize
+ 4,     // DYSrcVectorSize
false, // IsXFastestDimReduced
- 8,     // XSrcVectorSize
+ 4,     // XSrcVectorSize
false, // IsMeanInvStdFastestDimReduced
1,     // MeanInvStdSrcVectorSize
- 1,     // DGammaDstVectorSize
- 1>;    // DBetaDstVectorSize
+ 4,     // DGammaDstVectorSize
+ 4>;    // DBetaDstVectorSize
int main()
{
...@@ -93,20 +128,55 @@ int main()
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dx_dev(sizeof(DXDataType) * dx.mDesc.GetElementSpaceSize());
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());
std::vector<ck::index_t> dyStrides{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
std::vector<ck::index_t> xStrides{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
std::vector<ck::index_t> gammaStrides  = {0, 0, 0, C, 1};
std::vector<ck::index_t> meanStrides   = {G, 0, 0, 1, 0};
std::vector<ck::index_t> invStdStrides = {G, 0, 0, 1, 0};
std::vector<ck::index_t> dxStrides{dx.mDesc.GetStrides().begin(), dx.mDesc.GetStrides().end()};
// backward x
auto x_device_instance = XDeviceInstance{};
auto x_argument_ptr = x_device_instance.MakeArgumentPointer({N, H, W, G, C}, // lengths
dyStrides, // dyStrides
xStrides, // xStrides
gammaStrides, // gammaStrides
meanStrides, // meanStrides
invStdStrides, // invStdStrides
dxStrides, // dxStrides
{1, 2, 4}, // reduceDims
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dx_dev.GetDeviceBuffer());
if(!x_device_instance.IsSupportedArgument(x_argument_ptr.get()))
{
std::cout << "The runtime parameters are not supported." << __FILE__ << ":" << __LINE__
<< std::endl;
return 1;
};
auto x_invoker_ptr = x_device_instance.MakeInvokerPointer();
x_invoker_ptr->Run(x_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
// backward gamma & beta
auto gamma_beta_device_instance = GammaBetaDeviceInstance{};
auto gamma_beta_argument_ptr =
...@@ -128,7 +198,8 @@ int main()
if(!gamma_beta_device_instance.IsSupportedArgument(gamma_beta_argument_ptr.get()))
{
- std::cout << "The runtime parameters are not supported" << std::endl;
+ std::cout << "The runtime parameters are not supported." << __FILE__ << ":" << __LINE__
+           << std::endl;
return 1;
};
...@@ -158,9 +229,11 @@ int main()
dgamma_dev.FromDevice(dgamma.mData.data());
dbeta_dev.FromDevice(dbeta.mData.data());
dx_dev.FromDevice(dx.mData.data());
pass &= ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
pass &= ck::utils::check_err(dx, host_dx, "Error: Incorrect dx", 1e-3, 1e-3);
}
return (pass ? 0 : 1);
......
...@@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
# ScaleAdd ScaleAdd Relu
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
set(target 1)
endif()
endforeach()
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
constexpr ck::index_t NDimSpatial = 3;
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using OutDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <typename OutElementOp>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout, BiasLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<OutDataType, OutDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 32, 1, 8>,
8>;
using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
namespace {
// Use custom implementation to pass two more tensors for post op
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
bool run_grouped_conv_fwd(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
constexpr ck::index_t NumDs = 2;
const ck::index_t G = out_g_n_k_wos_desc.GetLengths()[0];
const ck::index_t K = out_g_n_k_wos_desc.GetLengths()[2];
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_lengths;
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_strides;
// Fill lengths other than G, K with 1 and strides with 0
bias_g_k_lengths.fill(1);
bias_g_k_strides.fill(0);
bias_g_k_lengths[0] = G;
bias_g_k_lengths[2] = K;
bias_g_k_strides[0] = K; // stride to G
bias_g_k_strides[2] = 1; // stride to K
const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
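// Worked example of the descriptor built above (hedged, for NDimSpatial = 3): lengths become
// {G, 1, K, 1, 1, 1} and strides {K, 0, 1, 0, 0, 0}, so only G * K bias values are stored and
// the zero strides broadcast them over N, D, H and W.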
// y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
Tensor<OutDataType>(broadcasted_bias_desc)};
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
std::cout << "z_tensor: " << d_tensors[0].mDesc << std::endl;
std::cout << "bias_tensor: " << d_tensors[1].mDesc << std::endl;
// Make sure that we allocated only G * K values for bias
assert(static_cast<ck::index_t>(d_tensors[1].mData.size()) == G * K);
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem z_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
DeviceMem bias_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
z_buf.ToDevice(d_tensors[0].mData.data());
bias_buf.ToDevice(d_tensors[1].mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
const std::array<const void*, NumDs> ds = {z_buf.GetDeviceBuffer(), bias_buf.GetDeviceBuffer()};
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
ds,
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
e_g_n_k_wos_lengths, bias_g_k_lengths},
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
e_g_n_k_wos_strides, bias_g_k_strides},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error("The device op with the specified compilation parameters does "
"not support this convolution problem.");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops() + G * K +
conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
G * K * sizeof(OutDataType) + conv_param.GetOutputByte<OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
auto ref_conv =
ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
0, /*Num A Elementwise Tensors*/
0, /*Num B Elementwise Tensors*/
NumDs>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op,
{},
{},
d_tensors);
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
}
return true;
}
} // namespace
#include "run_convnd_fwd_activ_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
...@@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
// Following shapes are selected to avoid overflow. Expect inf in case of
// size increase for some elementwise ops.
ck::utils::conv::ConvParam conv_param{
- 3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+ 3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
if(argc == 1)
{
......
...@@ -44,16 +44,30 @@
#define CK_USE_WAVES_PER_EU 0
#endif
// define general macros for various architectures
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define __gfx94__
#endif
#if defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__)
#define __gfx101__
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
#define __gfx103__
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
#define __gfx11__
#endif
// buffer resource
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-     defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-     defined(__gfx942__) // for GPU code
+     defined(__gfx90a__) || defined(__gfx94__)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
- #elif defined(__gfx1030__) // for GPU code
+ #elif defined(__gfx103__)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
- #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
+ #elif defined(__gfx11__)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
#endif
...@@ -61,12 +75,12 @@
#ifndef __HIP_DEVICE_COMPILE__ // for host code, define nothing
#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
#define CK_USE_AMD_V_MAC_F32
- #elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) || \
-     defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // for GPU code
+ #elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx103__) || \
+     defined(__gfx94__) // for GPU code
#define CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_DOT2_F32_F16
#define CK_USE_AMD_V_DOT4_I32_I8
- #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
+ #elif defined(__gfx11__)
#define CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_DOT2_F32_F16
#define CK_USE_AMD_V_DOT4_I32_I8_GFX11
...@@ -75,23 +89,22 @@
// MFMA instruction
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_MFMA
- #elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-     defined(__gfx942__) // for GPU code
+ #elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) // for GPU code
#define CK_USE_AMD_MFMA
#endif
- #if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+ #if(defined(__gfx90a__) || defined(__gfx94__))
#define CK_USE_AMD_MFMA_BF16_1K_OP
#endif
- #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+ #if defined(__gfx94__)
#define CK_USE_AMD_MFMA_GFX940
#endif
// WMMA instruction
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_WMMA
- #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
+ #elif defined(__gfx11__) // for GPU code
#define CK_USE_AMD_WMMA
#endif
...@@ -107,15 +120,13 @@
// buffer atomic add: floating point
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
- #elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-     defined(__gfx942__) // for GPU code
+ #elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#else // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
#endif
- #if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-     defined(__gfx942__)) // for GPU code
+ #if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
#else
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
...@@ -134,6 +145,9 @@
// inner product using V_DOT with DPP8 modifiers
#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1
// LDS direct loads using inline assembly
#define CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 1
// set stochastic rounding as default for f8 conversions
#define CK_USE_SR_F8_CONVERSION 1
...@@ -215,7 +229,7 @@
// denorm test fix, required to work around a denorm issue
#ifndef CK_WORKAROUND_DENORM_FIX
#define CK_WORKAROUND_DENORM_FIX 0
- #elif
+ #else
// enable only on MI200
#define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
#endif // CK_WORKAROUND_DENORM_FIX
......
...@@ -26,7 +26,7 @@ inline std::string get_device_name()
}
const std::string raw_name(props.gcnArchName);
- // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
+ // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
static std::map<std::string, std::string> device_name_map = {
{"Ellesmere", "gfx803"},
{"Baffin", "gfx803"},
...@@ -65,4 +65,23 @@ inline bool is_lds_direct_load_supported()
ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942";
}
inline bool is_navi1_supported()
{
return ck::get_device_name() == "gfx1010" || ck::get_device_name() == "gfx1011" ||
ck::get_device_name() == "gfx1012";
}
inline bool is_navi2_supported()
{
return ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx1031" ||
ck::get_device_name() == "gfx1032" || ck::get_device_name() == "gfx1034" ||
ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036";
}
inline bool is_navi3_supported()
{
return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103";
}
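// Usage sketch (assumption, not part of this diff): these helpers can gate architecture-specific
// code paths at runtime, e.g.
//   if(ck::is_navi3_supported()) { /* pick a gfx11 / WMMA-capable instance */ }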
} // namespace ck
...@@ -12,21 +12,23 @@ inline void hip_check_error(hipError_t x)
if(x != hipSuccess)
{
std::ostringstream ss;
- ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
-    << "in function: " << __func__;
+ ss << "HIP runtime error: " << hipGetErrorString(x) << ". "
+    << "hip_check_error.hpp"
+    << ": " << __LINE__ << "in function: " << __func__;
throw std::runtime_error(ss.str());
}
}
#define HIP_CHECK_ERROR(retval_or_funcall) \
do \
{ \
hipError_t _tmpVal = retval_or_funcall; \
if(_tmpVal != hipSuccess) \
{ \
std::ostringstream ostr; \
- ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
-      << hipGetErrorString(_tmpVal); \
+ ostr << "HIP Function Failed (" \
+      << "hip_check_error.hpp" \
+      << "," << __LINE__ << ") " << hipGetErrorString(_tmpVal); \
throw std::runtime_error(ostr.str()); \
} \
} while(0)
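// Usage sketch for the macro above (hedged; p_dev and num_bytes are placeholder names):
//   HIP_CHECK_ERROR(hipMalloc(&p_dev, num_bytes)); // any HIP call returning hipError_t works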
...@@ -30,7 +30,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
block_dim.y,
block_dim.z);
- printf("Warm up 1 time\n");
+ printf("Warm up %d times\n", stream_config.cold_niters_);
#endif
// warm up
for(int i = 0; i < stream_config.cold_niters_; ++i)
...@@ -103,14 +103,17 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
block_dim.y,
block_dim.z);
- printf("Warm up 1 time\n");
+ printf("Warm up %d times\n", stream_config.cold_niters_);
#endif
// warm up
preprocess();
- kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
- hip_check_error(hipGetLastError());
+ for(int i = 0; i < stream_config.cold_niters_; ++i)
+ {
+     kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+     hip_check_error(hipGetLastError());
+ }
- const int nrepeat = 10;
+ const int nrepeat = stream_config.nrepeat_;
#if DEBUG_LOG
printf("Start running %d times...\n", nrepeat);
#endif
......
...@@ -11,6 +11,6 @@ struct StreamConfig
hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false;
int log_level_ = 0;
- int cold_niters_ = 1;
- int nrepeat_ = 10;
+ int cold_niters_ = 5;
+ int nrepeat_ = 50;
};
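// Hedged usage sketch of the new defaults (invoker and argument are placeholders from the
// examples above):
//   StreamConfig cfg{nullptr, /*time_kernel=*/true}; // 5 warm-up launches, 50 timed repeats
//   float avg_ms = invoker.Run(argument, cfg);       // reported time is the mean per launch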
...@@ -59,7 +59,9 @@ struct BaseOperator
virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }
- virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const
+ virtual void SetWorkSpacePointer(BaseArgument* p_arg,
+                                  void* p_workspace,
+                                  const StreamConfig& = StreamConfig{}) const
{
assert(p_arg);
p_arg->p_workspace_ = p_workspace;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename DYDataType,
typename XDataType,
typename GammaDataType,
typename MeanInvStdDataType,
typename DXDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceNormalizationBwdData : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<index_t> lengths,
const std::vector<index_t> dyStrides,
const std::vector<index_t> xStrides,
const std::vector<index_t> gammaStrides,
const std::vector<index_t> meanStrides,
const std::vector<index_t> invStdStrides,
const std::vector<index_t> dxStrides,
const std::vector<index_t> reduceDims,
const void* p_dy,
const void* p_x,
const void* p_gamma,
const void* p_mean,
const void* p_invStd,
void* p_dx) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename DYDataType,
typename XDataType,
typename GammaDataType,
typename MeanInvStdDataType,
typename DXDataType,
index_t Rank,
index_t NumReduceDim>
using DeviceNormalizationBwdDataPtr = std::unique_ptr<DeviceNormalizationBwdData<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DXDataType,
Rank,
NumReduceDim>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
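For orientation, a hedged sketch of how this interface is typically driven, mirroring the
layernorm2d example earlier in the commit (`instance`, the buffer pointers and the 2-D shapes
are placeholders, not part of this header):

    // Build the argument, check support, then launch through the invoker.
    auto arg = instance.MakeArgumentPointer({M, N}, // lengths
                                            {N, 1}, // dyStrides
                                            {N, 1}, // xStrides
                                            {0, 1}, // gammaStrides (broadcast over M)
                                            {1, 0}, // meanStrides  (broadcast over N)
                                            {1, 0}, // invStdStrides
                                            {N, 1}, // dxStrides
                                            {1},    // reduceDims
                                            p_dy, p_x, p_gamma, p_mean, p_inv_std, p_dx);
    if(instance.IsSupportedArgument(arg.get()))
        instance.MakeInvokerPointer()->Run(arg.get(), StreamConfig{nullptr, false});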