Commit 982c3b60 authored by Jing Zhang

enable dynamic mode of conv and conv+resize_add

parent 1b79fce9
@@ -57,6 +57,62 @@ __global__ void
        integral_constant<bool, HasMainE0BlockLoop>{});
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by CONSTANT void pointer
// CONSTANT is needed to inform the compiler that the void pointers in the kernel signature
// point to the non-modifiable parameter address space, so it can enable the corresponding optimization
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
    // first cast the CONSTANT void* to a generic void*,
    // then cast the void* to Desc*;
    // the tensor descriptor's copy constructor doesn't accept address_space(4) pointers
const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast<const AGridDesc_E0_E1_K0_K1_E2*>(
cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc));
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
*reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>(
cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc));
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
*reinterpret_cast<const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2*>(
cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc));
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor));
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActiv(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{});
}
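Note: `cast_pointer_to_generic_address_space` is a CK utility that is not shown in this diff. A minimal sketch of what such a helper can look like, assuming `CONSTANT` expands to `__attribute__((address_space(4)))` as in CK's config header (the real implementation may differ):

template <typename T>
__device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p)
{
    // strip the address_space(4) qualifier so the descriptor's copy
    // constructor can accept the pointer; a C-style cast is the form
    // Clang/HIP accepts for address-space conversion
    return (T*)p;
}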
// pass tensor descriptor by CONSTANT void pointer
// CONSTANT is needed to inform the compiler that the void pointers in the kernel signature
// point to the non-modifiable parameter address space, so it can enable the corresponding optimization
@@ -73,16 +129,16 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-   kernel_gemm_dlops_v2_add(const FloatAB* __restrict__ p_a_grid,
+   kernel_gemm_dlops_v3_resize_add(
+       const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        const FloatC* __restrict__ p_bias_grid,
-       FloatC* __restrict__ p_c_grid,
        FloatC* __restrict__ p_d_grid,
        const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
        const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
        const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
        const void CONSTANT* p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
        const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
    // first cast the CONSTANT void* to a generic void*,
    // then cast the void* to Desc*
@@ -107,18 +163,79 @@ __global__ void
    __shared__ FloatAB p_shared_block[shared_block_size];
-   GridwiseGemm::Run(p_a_grid,
+   GridwiseGemm::ConvBiasActivResizeAddRun(p_a_grid,
                      p_b_grid,
                      p_bias_grid,
-                     p_c_grid,
                      p_d_grid,
                      p_shared_block,
                      a_e0_e1_k0_k1_e2_grid_desc,
                      b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                      c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                      d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
                      c_blockid_to_k_n_h_w_block_cluster_adaptor,
                      integral_constant<bool, HasMainE0BlockLoop>{});
}
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3_maxpool(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
FloatC* __restrict__ p_d_grid,
const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const void CONSTANT* p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
    // first cast the CONSTANT void* to a generic void*,
    // then cast the void* to Desc*;
    // the tensor descriptor's copy constructor doesn't accept address_space(4) pointers
const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast<const AGridDesc_E0_E1_K0_K1_E2*>(
cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc));
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
*reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>(
cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc));
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
*reinterpret_cast<const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2*>(
cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc));
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
*reinterpret_cast<const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx*>(
cast_pointer_to_generic_address_space(p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc));
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor));
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivMaxpoolRun(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{});
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
template <typename GridwiseGemm,
@@ -134,7 +251,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-   kernel_gemm_dlops_v2_resize_add(const FloatAB* __restrict__ p_a_grid,
+   kernel_gemm_dlops_v3_resize_add(const FloatAB* __restrict__ p_a_grid,
                                    const FloatAB* __restrict__ p_b_grid,
                                    const FloatC* __restrict__ p_bias_grid,
                                    FloatC* __restrict__ p_d_grid)
@@ -178,7 +295,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-   kernel_gemm_dlops_v2_maxpool(const FloatAB* __restrict__ p_a_grid,
+   kernel_gemm_dlops_v3_maxpool(const FloatAB* __restrict__ p_a_grid,
                                 const FloatAB* __restrict__ p_b_grid,
                                 const FloatC* __restrict__ p_bias_grid,
                                 FloatC* __restrict__ p_c_grid,
@@ -223,7 +340,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-   kernel_gemm_dlops_v2(const FloatAB* __restrict__ p_a_grid,
+   kernel_gemm_dlops_v3(const FloatAB* __restrict__ p_a_grid,
                         const FloatAB* __restrict__ p_b_grid,
                         const FloatC* __restrict__ p_bias_grid,
                         FloatC* __restrict__ p_c_grid)
@@ -517,11 +634,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        const auto H2 = Number<HoPerThread * 2>{};
        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-       const auto H0 = Number<Hx / (H1 * H2)>{};

        const auto W2 = Number<WoPerThread * 2>{};
        const auto W1 = Number<WoPerBlock / WoPerThread>{};
+#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
+       const auto H0 = Number<Hx / (H1 * H2)>{};
        const auto W0 = Number<Wx / (W1 * W2)>{};
+#else
+       const auto H0 = Hx / (H1 * H2);
+       const auto W0 = Wx / (W1 * W2);
+#endif

        const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor(
            d_k_n_hx_wx_grid_desc,
...
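The `#if` block above is the heart of the dynamic-mode change: `Number<>` carries its value in the type, so it requires a compile-time constant, while dynamic mode computes `Hx`/`Wx` at runtime. A minimal illustration (not from this commit; `some_runtime_value` is a hypothetical input):

// Number<N> is an integral_constant<index_t, N>, so N must be a constant expression
constexpr index_t hx_static = 32;
const auto h0_static = Number<hx_static / 8>{};  // OK: value known at compile time

const index_t hx_dynamic = some_runtime_value;   // hypothetical runtime input
// const auto h0_bad = Number<hx_dynamic / 8>{}; // would not compile
const index_t h0_dynamic = hx_dynamic / 8;       // dynamic mode: plain integer math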
@@ -91,8 +91,8 @@
// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0
-#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
+#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1
-#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 1
+#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0

// merge transformation uses magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0
...
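This flag flip is what actually switches the build to dynamic mode. My reading of the three mutually exclusive paths, as a sketch (the comments are my own; the macro names are from the code above):

#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
// descriptors are kernel arguments copied by value at launch
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// descriptors are built on the host, stored in device memory, and passed as
// const void CONSTANT*, as in kernel_gemm_dlops_v3 above (dynamic mode)
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
// descriptors are reconstructed inside the kernel from compile-time constants
#endif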
@@ -6,7 +6,7 @@
template <typename TInWei,
          typename TAcc,
          typename TOut,
-         ck::index_t activ_type,
+         ck::ActivTypeEnum_t activ_type,
          typename InLengths,
          typename WeiLengths,
          typename AddLengths,
@@ -71,7 +71,7 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());

-   constexpr index_t InWeiVectorSize = C1;
+   constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
@@ -106,16 +106,16 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
#elif 1
    constexpr auto BlockSize = 64;

-   constexpr auto KPerBlock = K;
+   constexpr auto KPerBlock = 8;
    constexpr auto HoPerBlock = 8;
    constexpr auto WoPerBlock = 32;

-   constexpr auto E1 = C0 * 9;
-   constexpr auto E2 = C1 / InWeiVectorSize;
+   constexpr auto E1 = 2 * 9;
+   constexpr auto E2 = 1;
    constexpr auto K2 = 2;

-   constexpr auto E1PerBlock = C0;
+   constexpr auto E1PerBlock = 2;

-   constexpr auto KPerThread = K;
+   constexpr auto KPerThread = KPerBlock;
    constexpr auto HoPerThread = 2;
    constexpr auto WoPerThread = 2;
    constexpr auto EPerThread = 1;
@@ -124,28 +124,21 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
        Sequence<1, E1PerBlock, 1, KPerBlock, 1>;

    constexpr auto ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr auto ABlockTransferDstScalarPerVector_E2 = E2;

    constexpr auto BThreadTransferSrcScalarPerVector_E2 = E2;

-   constexpr auto CThreadTransferDstScalarPerVector_K = K1;
+   constexpr auto CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
    const auto in_n_c0_hi_wi_c1_desc =
-       make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, C1));
+       make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));

    const auto wei_k_c0_y_x_c1_desc =
-       make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, C1));
+       make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));

    const auto add_n_k0_hox2_wox2_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1));

    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));

-   static_assert(in_n_c0_hi_wi_c1_desc.IsKnownAtCompileTime(), "");
-   static_assert(wei_k_c0_y_x_c1_desc.IsKnownAtCompileTime(), "");
-   static_assert(add_n_k0_hox2_wox2_k1_desc.IsKnownAtCompileTime(), "");
-   static_assert(out_n_k0_ho_wo_k1_desc.IsKnownAtCompileTime(), "");

    constexpr auto conv_driver =
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add<
            BlockSize,
...
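The dropped `static_assert`s only hold when every descriptor length is a compile-time `Number<>`; in dynamic mode `N`, `Hi`, `Wi`, and friends come from runtime arguments, so the checks cannot compile. If both modes had to share this file, a guarded variant would be one option (a sketch, not what the commit does):

#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
    // meaningful only when the descriptors are built from compile-time values
    static_assert(in_n_c0_hi_wi_c1_desc.IsKnownAtCompileTime(), "");
    static_assert(wei_k_c0_y_x_c1_desc.IsKnownAtCompileTime(), "");
#endif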
@@ -63,7 +63,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());

-   constexpr index_t InWeiVectorSize = C1;
+   constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
@@ -98,30 +98,35 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
#elif 1
    constexpr index_t BlockSize = 64;

-   constexpr index_t KPerBlock = K;
+   constexpr index_t KPerBlock = 8;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;

-   constexpr index_t E1 = C0 * Y * X;
-   constexpr index_t E2 = C1 / InWeiVectorSize;
+   constexpr index_t E1 = 2 * 9;
+   constexpr index_t E2 = 1;
    constexpr index_t K2 = 2;

-   constexpr index_t E1PerBlock = C0;
+   constexpr index_t E1PerBlock = 2;

-   constexpr index_t KPerThread = K;
+   constexpr index_t KPerThread = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread = 1;

-   using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, Y * X, 1, 1, E2>;
+   using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
        Sequence<1, E1PerBlock, 1, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;

    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;

-   constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
+   constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
+   if(KPerThread % InWeiVectorSize != 0)
+   {
+       throw std::runtime_error("wrong! KPerThread cannot be divided by InWeiVectorSize");
+   }
    const auto in_n_c0_hi_wi_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));

    const auto wei_k_c0_y_x_c1_desc =
...
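These tuning constants encode the implicit-GEMM reduction dimension E = C0 * Y * X. Replacing `C0 * Y * X` with the literal `2 * 9` therefore hard-codes C0 = 2 and a 3x3 filter, and `E2 = 1` together with `InWeiVectorSize = 8` implies C1 = 8. A quick consistency check (my illustration, not commit code):

constexpr index_t C0 = 2, Y = 3, X = 3, C1 = 8, InWeiVectorSize = 8;
constexpr index_t E1 = C0 * Y * X;           // 18 == 2 * 9, matching the literal
constexpr index_t E2 = C1 / InWeiVectorSize; // 1, matching the literal
static_assert(E1 == 2 * 9 && E2 == 1, "hardcoded tuning values must stay consistent");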
@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
          ck::index_t ABlockTransferDstScalarPerVector_E2,
          ck::index_t BThreadTransferSrcScalarPerVector_E2,
          ck::index_t CThreadTransferDstScalarPerVector_K,
-         ck::index_t activ_type>
+         ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
{
    template <typename... Wei,
@@ -84,7 +84,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

-#if 1
+#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
#else
@@ -95,6 +95,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;
+
+       const auto OutRightPadHx = OutRightPadH * 2;
+       const auto OutRightPadWx = OutRightPadW * 2;

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];
@@ -186,8 +189,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
-                      make_pad_transform(Hox2, I0, Number<OutRightPadH * 2>{}),
-                      make_pad_transform(Wox2, I0, Number<OutRightPadW * 2>{})),
+                      make_pad_transform(Hox2, I0, OutRightPadHx),
+                      make_pad_transform(Wox2, I0, OutRightPadWx)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
@@ -247,6 +250,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};

+       // hack to control index calculation when iterating over the c_k0_k1_n_h0_h1_h2_w0_w1_w2 global tensor
        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
@@ -288,10 +292,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
        // clang-format on

-       static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(d_k_n_hopx2_wopx2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");

        // GEMM
        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
@@ -452,7 +452,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
        if(has_main_e0_block_loop)
        {
-           const auto kernel = kernel_gemm_dlops_v2_resize_add<
+           const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
@@ -486,7 +486,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
        }
        else
        {
-           const auto kernel = kernel_gemm_dlops_v2_resize_add<
+           const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
...
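`OutRightPadHx`/`OutRightPadWx` above replace the `Number<OutRightPadH * 2>{}` expressions because in dynamic mode the pad amounts are runtime values. The arithmetic being expressed, with illustrative numbers (not from the commit):

// output tiles are HoPerBlock x WoPerBlock, so Ho is padded up to a multiple:
const index_t Ho = 30, HoPerBlock = 8;                               // example values
const index_t Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; // 32
const index_t OutRightPadH  = Hop - Ho;                              // 2
const index_t OutRightPadHx = OutRightPadH * 2;                      // 4: pads Hox2 = 60 up to 64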
@@ -79,7 +79,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1
        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

-#if 1
+#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
#else
@@ -254,10 +254,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
        // clang-format on

-       static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");

        // GEMM
        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
@@ -329,11 +325,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1
        float ave_time = 0;

-       static_assert(a_e0_e1_k0_k1_e2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
-       static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");

#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
        if(has_main_e0_block_loop)
@@ -410,7 +401,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1
        {
            const auto kernel =
-               kernel_gemm_dlops_v2<GridwiseGemm,
+               kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
@@ -442,7 +433,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1
        {
            const auto kernel =
-               kernel_gemm_dlops_v2<GridwiseGemm,
+               kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
...
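Neither driver hunk shows the launch site, but with CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER the host has to place each descriptor in device memory and pass its address as a CONSTANT void pointer. A self-contained HIP sketch of that hand-off; `Desc`, the buffer handling, and the kernel here are illustrative stand-ins, not CK code:

#include <hip/hip_runtime.h>
#define CONSTANT __attribute__((address_space(4))) // assumed, as in CK's config header

struct Desc { int n, k0, ho, wo, k1; }; // stand-in for a CK tensor descriptor

__global__ void consume(const void CONSTANT* p_desc_void)
{
    // cast away address_space(4) first, then reinterpret as the concrete type
    const Desc desc = *reinterpret_cast<const Desc*>((const void*)p_desc_void);
    (void)desc; // a real kernel would index tensors through the descriptor
}

int main()
{
    Desc h_desc{1, 8, 30, 30, 8};
    void* d_desc = nullptr;
    hipMalloc(&d_desc, sizeof(Desc));
    hipMemcpy(d_desc, &h_desc, sizeof(Desc), hipMemcpyHostToDevice);
    consume<<<1, 64>>>((const void CONSTANT*)d_desc);
    hipDeviceSynchronize();
    hipFree(d_desc);
    return 0;
}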
@@ -15,7 +15,7 @@
#include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"

-#define USE_DYNAMIC_MODE 0
+#define USE_DYNAMIC_MODE 1
#define USE_CONV_FWD_V5R1_NCHWC 1

enum ConvForwardAlgo
@@ -46,7 +46,7 @@ int main(int argc, char* argv[])
        exit(1);
    }

-   constexpr index_t activ_type = 0;
+   constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
@@ -78,6 +78,9 @@ int main(int argc, char* argv[])
    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+   const auto Hox2 = Ho * 2;
+   const auto Wox2 = Wo * 2;
#else
    // static mode
    if(argc < 6)
@@ -93,7 +96,7 @@ int main(int argc, char* argv[])
    const bool do_log = std::stoi(argv[4]);
    const int nrepeat = std::stoi(argv[5]);

-   constexpr index_t activ_type = 1;
+   constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;

#if 0
    constexpr auto N = Number<1>{};
...
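In dynamic mode `Ho`/`Wo` come out of the formulas in the hunk above at runtime, and the resize-add tensor is twice the conv output in each spatial dimension. A worked example with illustrative numbers (not from the commit):

// 3x3 filter, stride 1, dilation 1, pad 1 on each side, Hi = 28:
const index_t YEff = (3 - 1) * 1 + 1;             // 3
const index_t Ho   = (28 + 1 + 1 - YEff) / 1 + 1; // 28: same height as the input
const index_t Hox2 = Ho * 2;                      // 56: resize-add target height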
@@ -15,7 +15,7 @@
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"

-#define USE_DYNAMIC_MODE 0
+#define USE_DYNAMIC_MODE 1
#define USE_CONV_FWD_V5R1_NCHWC 1

enum ConvForwardAlgo
@@ -45,7 +45,7 @@ int main(int argc, char* argv[])
        exit(1);
    }

-   constexpr index_t activ_type = 1;
+   constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
...