"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "d68df25588d7462d970900ae7ae7514b1e82bbce"
Commit 95228cd7 authored by Jing Zhang's avatar Jing Zhang
Browse files

refactor for 1x1

parent bb7a8b28
...@@ -134,6 +134,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 ...@@ -134,6 +134,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
a_thread_buf; a_thread_buf;
constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3<FloatAB, constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3<FloatAB,
FloatAB,
FloatC, FloatC,
decltype(a_thread_mtx_), decltype(a_thread_mtx_),
decltype(b_thread_mtx_), decltype(b_thread_mtx_),
......
...@@ -139,10 +139,10 @@ template <index_t BlockSize, ...@@ -139,10 +139,10 @@ template <index_t BlockSize,
typename BGlobalMoveSliceWindowStepHacks> typename BGlobalMoveSliceWindowStepHacks>
struct GridwiseGemmDlops_km_kn_mn_v3 struct GridwiseGemmDlops_km_kn_mn_v3
{ {
static constexpr auto E = EPerBlock;
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{ {
constexpr auto E = EPerBlock * 3 * 3;
constexpr auto max_lds_align = constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{}); math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
...@@ -181,8 +181,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -181,8 +181,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize()); p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
constexpr auto E = EPerBlock * 3 * 3;
// const auto E = a_e_k_global_desc.GetLength(I0); // const auto E = a_e_k_global_desc.GetLength(I0);
// const auto K = a_e_k_global_desc.GetLength(I1); // const auto K = a_e_k_global_desc.GetLength(I1);
......
...@@ -9,20 +9,31 @@ namespace ck { ...@@ -9,20 +9,31 @@ namespace ck {
// C[M, N] += transpose(A[K, M]) * B[K, N] // C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data // Element of matrix can be vectorized data
// Assume: // Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time // 1. AThreadDesc_E_K, BThreadDesc_E_N_Ho_Wo, CThreadDesc_K_N_Ho_Wo are known at compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time // 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatAB, template <typename FloatA,
typename FloatB,
typename FloatC, typename FloatC,
typename ADesc, typename AThreadDesc_E_K,
typename BDesc, typename BThreadDesc_E_N_Ho_Wo,
typename CDesc, typename CThreadDesc_K_N_Ho_Wo,
index_t H, index_t H,
index_t W, index_t W,
typename enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() && typename enable_if<AThreadDesc_E_K::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(), BThreadDesc_E_N_Ho_Wo::IsKnownAtCompileTime() &&
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
bool>::type = false> bool>::type = false>
struct ThreadwiseGemmDlops_km_kn_mn_v3 struct ThreadwiseGemmDlops_km_kn_mn_v3
{ {
__device__ ThreadwiseGemmDlops_km_kn_mn_v3()
{
static_assert(AThreadDesc_E_K::IsKnownAtCompileTime() &&
BThreadDesc_E_N_Ho_Wo::IsKnownAtCompileTime() &&
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
}
template <typename ABuffer, template <typename ABuffer,
typename AOriginIdx, typename AOriginIdx,
typename BBuffer, typename BBuffer,
...@@ -37,8 +48,9 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -37,8 +48,9 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
COriginIdx) COriginIdx)
{ {
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() && static_assert(AThreadDesc_E_K::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(), BThreadDesc_E_N_Ho_Wo::IsKnownAtCompileTime() &&
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time"); "wrong! Desc should be known at compile-time");
static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value && static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value &&
...@@ -47,16 +59,16 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -47,16 +59,16 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
"wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");
static_assert( static_assert(
is_same<remove_cvref_t<typename ABuffer::type>, remove_cvref_t<FloatAB>>::value && is_same<remove_cvref_t<typename ABuffer::type>, remove_cvref_t<FloatA>>::value &&
is_same<remove_cvref_t<typename BBuffer::type>, remove_cvref_t<FloatAB>>::value && is_same<remove_cvref_t<typename BBuffer::type>, remove_cvref_t<FloatB>>::value &&
is_same<remove_cvref_t<typename CBuffer::type>, remove_cvref_t<FloatC>>::value && is_same<remove_cvref_t<typename CBuffer::type>, remove_cvref_t<FloatC>>::value &&
"wrong! inconsistent type"); "wrong! inconsistent type");
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto E = ADesc{}.GetLength(I0); constexpr auto E = AThreadDesc_E_K{}.GetLength(I0);
constexpr auto K = ADesc{}.GetLength(I1); constexpr auto K = AThreadDesc_E_K{}.GetLength(I1);
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
...@@ -65,28 +77,28 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -65,28 +77,28 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
static_for<0, E, 1>{}([&](auto e) { static_for<0, E, 1>{}([&](auto e) {
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
constexpr index_t a_offset = constexpr index_t a_offset =
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k)); AThreadDesc_E_K{}.CalculateOffset(a_origin_idx + make_tuple(e, k));
#if 0 #if 0
if constexpr(H == 2 && W == 2) if constexpr(H == 2 && W == 2)
{ {
constexpr index_t b_offset_0 = constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 = constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
constexpr index_t b_offset_2 = constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_3 = constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));
constexpr index_t c_offset_0 = constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 = constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
constexpr index_t c_offset_2 = constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_3 = constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}], amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}], b_buf[Number<b_offset_0>{}],
...@@ -101,22 +113,22 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -101,22 +113,22 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
else if constexpr(H == 4 && W == 1) else if constexpr(H == 4 && W == 1)
{ {
constexpr index_t b_offset_0 = constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 = constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_2 = constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0));
constexpr index_t b_offset_3 = constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0)); BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));
constexpr index_t c_offset_0 = constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 = constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_2 = constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0));
constexpr index_t c_offset_3 = constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}], amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}], b_buf[Number<b_offset_0>{}],
...@@ -133,21 +145,14 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -133,21 +145,14 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
{ {
static_for<0, H, 1>{}([&](auto h) { static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) { static_for<0, W, 1>{}([&](auto w) {
constexpr index_t b_offset = constexpr index_t b_offset = BThreadDesc_E_N_Ho_Wo{}.CalculateOffset(
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w)); b_origin_idx + make_tuple(e, 0, h, w));
constexpr index_t c_offset = constexpr index_t c_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w)); c_origin_idx + make_tuple(k, 0, h, w));
#if 1 c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
// c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}( a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
// a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
c_buf(Number<c_offset>{}) = a_buf[Number<a_offset>{}];
#else
inner_product(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{}));
#endif
}); });
}); });
} }
......
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp" #include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"
template <typename TInWei, template <typename TInWei,
...@@ -50,11 +49,19 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -50,11 +49,19 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const auto Y = wei_k_c_y_x_lengths[I2]; const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3]; const auto X = wei_k_c_y_x_lengths[I3];
#if 0
const auto C0 = C / Number<InWeiVectorSize>{}; const auto C0 = C / Number<InWeiVectorSize>{};
const auto C1 = Number<InWeiVectorSize>{}; const auto C1 = Number<InWeiVectorSize>{};
const auto K0 = K / Number<InWeiVectorSize>{}; const auto K0 = K / Number<InWeiVectorSize>{};
const auto K1 = Number<InWeiVectorSize>{}; const auto K1 = Number<InWeiVectorSize>{};
#else
const auto C0 = 1;
const auto C1 = C;
const auto K0 = 1;
const auto K1 = K;
#endif
Tensor<TInWei> in_n_c0_hi_wi_c1( Tensor<TInWei> in_n_c0_hi_wi_c1(
HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1})); HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
...@@ -64,13 +71,11 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -64,13 +71,11 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1})); HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));
auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) { auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) = in_n_c0_hi_wi_c1(n, c / C1, hi, wi, c % C1) = in_n_c_hi_wi(n, c, hi, wi);
in_n_c_hi_wi(n, c, hi, wi);
}; };
auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) { auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) = wei_k_c0_y_x_c1(k, c / C1, y, x, c % C1) = wei_k_c_y_x(k, c, y, x);
wei_k_c_y_x(k, c, y, x);
}; };
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)(); make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
...@@ -99,50 +104,44 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -99,50 +104,44 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
constexpr index_t KPerBlock = 16; constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8; constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 8; constexpr index_t WoPerBlock = 8;
constexpr index_t EPerBlock = 1; constexpr index_t EPerBlock = 16;
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 1; constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1; constexpr index_t WoPerThread = 1;
constexpr index_t EPerThread = EPerBlock; constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>; using ABlockTransferThreadSliceLengths_E_K = Sequence<4, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>; using ABlockTransferThreadClusterLengths_E_K = Sequence<4, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1; constexpr index_t ABlockTransferSrcScalarPerVector_E = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1; constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; constexpr index_t BThreadTransferSrcScalarPerVector_E = 4;
constexpr index_t CThreadTransferDstScalarPerVector_W = 1; constexpr index_t CThreadTransferDstScalarPerVector_K = 4;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#endif #endif
constexpr auto conv_driver = constexpr auto conv_driver =
#if 0 DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad<
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad BlockSize,
#else TInWei,
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad TAcc,
#endif TOut,
<BlockSize, KPerBlock,
TInWei, HoPerBlock,
TAcc, WoPerBlock,
TOut, EPerBlock,
KPerBlock, KPerThread,
HoPerBlock, HoPerThread,
WoPerBlock, WoPerThread,
EPerBlock, EPerThread,
KPerThread, ABlockTransferThreadSliceLengths_E_K,
HoPerThread, ABlockTransferThreadClusterLengths_E_K,
WoPerThread, ABlockTransferSrcScalarPerVector_E,
EPerThread, ABlockTransferDstScalarPerVector_K,
ABlockTransferThreadSliceLengths_E_K, BThreadTransferSrcScalarPerVector_E,
ABlockTransferThreadClusterLengths_E_K, CThreadTransferDstScalarPerVector_K>{};
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
conv_driver.Run(wei_k_c0_y_x_c1_desc, conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc, in_n_c0_hi_wi_c1_desc,
...@@ -158,8 +157,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -158,8 +157,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data()); out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) { auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_n_k_ho_wo(n, k, ho, wo) = out_n_k_ho_wo(n, k, ho, wo) = out_n_k0_ho_wo_k1(n, k / K1, ho, wo, k % K1);
out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
}; };
make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)(); make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
......
...@@ -22,8 +22,8 @@ template <ck::index_t BlockSize, ...@@ -22,8 +22,8 @@ template <ck::index_t BlockSize,
typename ABlockTransferThreadClusterLengths_E_K, typename ABlockTransferThreadClusterLengths_E_K,
ck::index_t ABlockTransferSrcScalarPerVector_E, ck::index_t ABlockTransferSrcScalarPerVector_E,
ck::index_t ABlockTransferDstScalarPerVector_K, ck::index_t ABlockTransferDstScalarPerVector_K,
ck::index_t BThreadTransferSrcScalarPerVector_W, ck::index_t BThreadTransferSrcScalarPerVector_E,
ck::index_t CThreadTransferDstScalarPerVector_W> ck::index_t CThreadTransferDstScalarPerVector_K>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
{ {
template <typename... Wei, template <typename... Wei,
...@@ -160,26 +160,26 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -160,26 +160,26 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{}; constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_step_hacks = constexpr auto b_e_n_ho_wo_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack = constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}; Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format // hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks = constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{}, make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0>{}));
...@@ -211,13 +211,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -211,13 +211,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
ABlockTransferDstScalarPerVector_K, ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>, Sequence<0, 2, 3, 1>,
3, 0,
BThreadTransferSrcScalarPerVector_W, BThreadTransferSrcScalarPerVector_E,
false, // don't move back src coordinate after threadwise copy, which will be fused with false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation // MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>, Sequence<0, 2, 3, 1>,
0, 0,
CThreadTransferDstScalarPerVector_W, CThreadTransferDstScalarPerVector_K,
decltype(a_e_k_global_step_hacks), decltype(a_e_k_global_step_hacks),
decltype(b_e_n_ho_wo_global_step_hacks), decltype(b_e_n_ho_wo_global_step_hacks),
decltype(c_k_n_ho_wo_global_tensor_step_hacks), decltype(c_k_n_ho_wo_global_tensor_step_hacks),
......
...@@ -11,7 +11,7 @@ cmake ...@@ -11,7 +11,7 @@ cmake
-D HALF_INCLUDE_DIR="/root/workspace/external/half/include" \ -D HALF_INCLUDE_DIR="/root/workspace/external/half/include" \
-D BUILD_DEV=OFF \ -D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX1030 -O3 --amdgpu-target=gfx1030 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX906 -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
...@@ -51,7 +51,8 @@ REPEAT=$6 ...@@ -51,7 +51,8 @@ REPEAT=$6
#./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1 #./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 3 3 8 8 1 1 1 1 1 1 1 1 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 3 3 8 8 1 1 1 1 1 1 1 1
./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 1 1 8 8 1 1 1 1 0 0 0 0
################################################ layout algo verify init log repeat M___ N___ K___ ################################################ layout algo verify init log repeat M___ N___ K___
#./host/driver_offline/gemm_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 960 1024 1024 #./host/driver_offline/gemm_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 960 1024 1024
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment