"docs/vscode:/vscode.git/clone" did not exist on "2d0b184bb615f7720dc40559e2b07cd1cbe7a95b"
Commit 26c42b94 authored by Jing Zhang's avatar Jing Zhang
Browse files

conv-only use v5r1_add

parent 4eb9a7a4
...@@ -133,11 +133,56 @@ __global__ void ...@@ -133,11 +133,56 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif #endif
kernel_gemm_dlops_v2_add(const FloatAB* __restrict__ p_a_grid, kernel_gemm_dlops_v2_resize_add(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid, const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid, const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid, FloatC* __restrict__ p_d_grid)
FloatC* __restrict__ p_d_grid) {
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{};
constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx{};
constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
CBlockIdToBlockClusterAdaptor_K_N_H_W{};
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_bias_grid,
nullptr,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{});
}
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v2_maxpool(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
FloatC* __restrict__ p_d_grid)
{ {
constexpr index_t shared_block_size = constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
...@@ -165,6 +210,49 @@ __global__ void ...@@ -165,6 +210,49 @@ __global__ void
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{});
} }
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v2(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{};
constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
CBlockIdToBlockClusterAdaptor_K_N_H_W{};
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
nullptr,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{});
}
#endif #endif
template <index_t BlockSize, template <index_t BlockSize,
...@@ -209,6 +297,8 @@ template <index_t BlockSize, ...@@ -209,6 +297,8 @@ template <index_t BlockSize,
typename AGlobalMoveSliceWindowStepHacks, typename AGlobalMoveSliceWindowStepHacks,
typename BGlobalMoveSliceWindowStepHacks, typename BGlobalMoveSliceWindowStepHacks,
index_t activ_type = 0, index_t activ_type = 0,
index_t bias_type = 0,
index_t out_type = 1,
index_t add_type = 0> index_t add_type = 0>
struct GridwiseGemmDlops_km_kn_mn_v3_add struct GridwiseGemmDlops_km_kn_mn_v3_add
{ {
...@@ -426,13 +516,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -426,13 +516,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
const auto K1 = Number<KPerBlock>{}; const auto K1 = Number<KPerBlock>{};
const auto K0 = K / K1; const auto K0 = K / K1;
const auto H2 = HoPerThread * 2; const auto H2 = Number<HoPerThread * 2>{};
const auto H1 = Number<HoPerBlock / HoPerThread>{}; const auto H1 = Number<HoPerBlock / HoPerThread>{};
const auto H0 = Hx / (H1 * H2); const auto H0 = Number<Hx / (H1 * H2)>{};
const auto W2 = WoPerThread * 2; const auto W2 = Number<WoPerThread * 2>{};
const auto W1 = Number<WoPerBlock / WoPerThread>{}; const auto W1 = Number<WoPerBlock / WoPerThread>{};
const auto W0 = Wx / (W1 * W2); const auto W0 = Number<Wx / (W1 * W2)>{};
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor( const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor(
d_k_n_hx_wx_grid_desc, d_k_n_hx_wx_grid_desc,
...@@ -449,11 +539,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -449,11 +539,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDK0K1NH0H1HxW0W1WxGridDescriptor(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc) MakeDK0K1NH0H1HxW0W1WxGridDescriptor(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc)
{ {
if constexpr(add_type == 0) if constexpr(add_type == 1)
{ {
return MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(d_k_n_hx_wx_grid_desc); return MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(d_k_n_hx_wx_grid_desc);
} }
else if constexpr(add_type == 1) else if constexpr(add_type == 2)
{ {
return MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc); return MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc);
} }
...@@ -544,9 +634,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -544,9 +634,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop(); constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop();
constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop(); constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
// const auto Ho = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I3);
// const auto Wo = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I4);
const auto c_k_n_h_w_block_cluster_idx = const auto c_k_n_h_w_block_cluster_idx =
c_blockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex( c_blockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex(
make_multi_index(get_block_1d_id())); make_multi_index(get_block_1d_id()));
...@@ -909,6 +996,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -909,6 +996,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
} }
// activ // activ
if constexpr(activ_type > 0)
{ {
static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) { static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
if constexpr(activ_type == 1) if constexpr(activ_type == 1)
...@@ -929,7 +1017,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -929,7 +1017,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
}); });
} }
// Bias if constexpr(bias_type == 1)
{ {
constexpr auto bias_k0_k1_thread_desc = constexpr auto bias_k0_k1_thread_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{})); make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
...@@ -982,6 +1070,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -982,6 +1070,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
#if 1 #if 1
// Output // Output
if constexpr(out_type == 1)
{ {
// hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global // hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global
// tensor // tensor
...@@ -1032,7 +1121,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -1032,7 +1121,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
#if 1 #if 1
// Resize_Add // Resize_Add
if constexpr(add_type == 0) if constexpr(add_type == 1)
{ {
constexpr auto HoPerThreadx2 = HoPerThread * 2; constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2; constexpr auto WoPerThreadx2 = WoPerThread * 2;
...@@ -1145,7 +1234,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -1145,7 +1234,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks); d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks);
} }
// MaxPool // MaxPool
else if constexpr(add_type == 1) else if constexpr(add_type == 2)
{ {
static_assert(HoPerThread % 2 == 0 && WoPerThread % 2 == 0, ""); static_assert(HoPerThread % 2 == 0 && WoPerThread % 2 == 0, "");
......
...@@ -27,7 +27,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0 ...@@ -27,7 +27,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
const Tensor<TInWei>& in_n_c0_hi_wi_c1, const Tensor<TInWei>& in_n_c0_hi_wi_c1,
const Tensor<TInWei>& wei_k_c0_y_x_c1, const Tensor<TInWei>& wei_k_c0_y_x_c1,
const Tensor<TOut>& bias_k0_k1, const Tensor<TOut>& bias_k0_k1,
Tensor<TOut>& out_n_k0_ho_wo_k1,
const Tensor<TOut>& add_n_k0_hox2_wox2_k1, const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
Tensor<TOut>& add_n_k0_hox2_wox2_k1_out, Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
ck::index_t nrepeat) ck::index_t nrepeat)
...@@ -64,8 +63,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0 ...@@ -64,8 +63,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
in_n_c0_hi_wi_c1.mDesc.GetElementSpace()); in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace()); DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) * DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace()); add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
...@@ -191,7 +188,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0 ...@@ -191,7 +188,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>( static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()), static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()), static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
nrepeat); nrepeat);
...@@ -219,10 +215,8 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0 ...@@ -219,10 +215,8 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>( static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()), static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()), static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
0); 0);
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data()); add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
} }
...@@ -132,10 +132,6 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1 ...@@ -132,10 +132,6 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
const auto out_n_k0_ho_wo_k1_desc = const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)); make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
static_assert(in_n_c0_hi_wi_c1_desc.IsKnownAtCompileTime(), "");
static_assert(wei_k_c0_y_x_c1_desc.IsKnownAtCompileTime(), "");
static_assert(out_n_k0_ho_wo_k1_desc.IsKnownAtCompileTime(), "");
constexpr auto conv_driver = constexpr auto conv_driver =
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad< DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad<
BlockSize, BlockSize,
......
...@@ -49,7 +49,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -49,7 +49,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid, const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid, const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
FloatC* __restrict__ p_d_grid, FloatC* __restrict__ p_d_grid,
const int nrepeat) const const int nrepeat) const
{ {
...@@ -338,7 +337,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -338,7 +337,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks), decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks),
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack), decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack), decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack),
activ_type>; activ_type,
1, // bias_type
0, // out_type
1 // add_type
>;
const auto a_e0_e1_k0_k1_e2_grid_desc = const auto a_e0_e1_k0_k1_e2_grid_desc =
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc); GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
...@@ -374,7 +377,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -374,7 +377,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if(has_main_e0_block_loop) if(has_main_e0_block_loop)
{ {
const auto kernel = kernel_gemm_dlops_v2_add< const auto kernel = kernel_gemm_dlops_v2_resize_add<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -393,7 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -393,7 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_c_grid,
p_d_grid, p_d_grid,
a_e0_e1_k0_k1_e2_grid_desc, a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
...@@ -403,7 +405,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -403,7 +405,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
} }
else else
{ {
const auto kernel = kernel_gemm_dlops_v2_add< const auto kernel = kernel_gemm_dlops_v2_resize_add<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -422,7 +424,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -422,7 +424,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_c_grid,
p_d_grid, p_d_grid,
a_e0_e1_k0_k1_e2_grid_desc, a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
...@@ -455,7 +456,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -455,7 +456,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if(has_main_e0_block_loop) if(has_main_e0_block_loop)
{ {
const auto kernel = kernel_gemm_dlops_v2_add< const auto kernel = kernel_gemm_dlops_v2_resize_add<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -475,7 +476,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -475,7 +476,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_c_grid,
p_d_grid, p_d_grid,
cast_pointer_to_constant_address_space( cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
...@@ -490,8 +490,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -490,8 +490,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
} }
else else
{ {
const auto kernel = kernel_gemm_dlops_v2_resize_add<
const auto kernel = kernel_gemm_dlops_v2_add<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -511,7 +510,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -511,7 +510,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_c_grid,
p_d_grid, p_d_grid,
cast_pointer_to_constant_address_space( cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
...@@ -524,7 +522,35 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -524,7 +522,35 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
cast_pointer_to_constant_address_space( cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel = kernel_gemm_dlops_v2_resize_add<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid);
}
#endif #endif
return ave_time; return ave_time;
} }
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "common_header.hpp" #include "common_header.hpp"
#include "tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp" #include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp" #include "gridwise_gemm_dlops_v2_add.hpp"
template <ck::index_t BlockSize, template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
...@@ -259,7 +259,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -259,7 +259,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), ""); static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");
// GEMM // GEMM
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3< using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3_add<
BlockSize, BlockSize,
FloatAB, FloatAB,
FloatAcc, FloatAcc,
...@@ -268,6 +268,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -268,6 +268,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype(a_e0_e1_k_e2_grid_desc), decltype(a_e0_e1_k_e2_grid_desc),
decltype(b_e0_e1_n_ho_wo_e2_grid_desc), decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
decltype(c_k_n_hop_wop_grid_desc), decltype(c_k_n_hop_wop_grid_desc),
decltype(c_k_n_hop_wop_grid_desc),
E1, E1,
E2, E2,
K2, K2,
...@@ -298,9 +299,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -298,9 +299,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype(a_e0_e1_k_e2_global_step_hacks), decltype(a_e0_e1_k_e2_global_step_hacks),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks), decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks), decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack), decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack), decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack),
activ_type>; activ_type,
0, // bias_type
1, // out_type
0 // add_type
>;
const auto a_e0_e1_k0_k1_e2_grid_desc = const auto a_e0_e1_k0_k1_e2_grid_desc =
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc); GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
...@@ -469,7 +475,33 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -469,7 +475,33 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
cast_pointer_to_constant_address_space( cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel =
kernel_gemm_dlops_v2<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid);
}
#endif #endif
return ave_time; return ave_time;
} }
......
...@@ -323,8 +323,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -323,8 +323,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2 Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
9, 9,
BThreadTransferSrcScalarPerVector_E2, BThreadTransferSrcScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy, which will be fused with false, // don't move back src coordinate after threadwise copy, which will be fused
// MoveSrcSliceWindow() to save addr computation // with MoveSrcSliceWindow() to save addr computation
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2 Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2
1, 1,
CThreadTransferDstScalarPerVector_K, CThreadTransferDstScalarPerVector_K,
...@@ -335,7 +335,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -335,7 +335,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack), decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack), decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack),
activ_type, activ_type,
1>; 1, // bias_type
1, // out_type
2 // add_type
>;
const auto a_e0_e1_k0_k1_e2_grid_desc = const auto a_e0_e1_k0_k1_e2_grid_desc =
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc); GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
...@@ -528,16 +531,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -528,16 +531,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), ""); static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), ""); static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel = const auto kernel = kernel_gemm_dlops_v2_maxpool<
kernel_gemm_dlops_v2_add<GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>, remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>, remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop>; has_main_e0_block_loop>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
......
...@@ -214,7 +214,6 @@ int main(int argc, char* argv[]) ...@@ -214,7 +214,6 @@ int main(int argc, char* argv[])
Tensor<in_data_t> in(in_lengths_host); Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host); Tensor<in_data_t> wei(wei_lengths_host);
Tensor<in_data_t> add(add_lengths_host); Tensor<in_data_t> add(add_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
Tensor<in_data_t> add_device(add_lengths_host); Tensor<in_data_t> add_device(add_lengths_host);
Tensor<in_data_t> add_host(add_lengths_host); Tensor<in_data_t> add_host(add_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host); Tensor<out_data_t> bias(bias_lengths_host);
...@@ -308,7 +307,6 @@ int main(int argc, char* argv[]) ...@@ -308,7 +307,6 @@ int main(int argc, char* argv[])
in, in,
wei, wei,
bias, bias,
out_device,
add, add,
add_device, add_device,
nrepeat); nrepeat);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment