"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "67533c798ca95172d7b9999dd28d9d6e57a67432"
Commit 3e298e42 authored by Jing Zhang

add bias

parent 1e6d6782
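
For orientation: this commit threads a per-output-channel bias through the v5r1 implicit-GEMM convolution-add path. The kernels and the driver gain a `p_bias_grid` pointer laid out as (K0, K1), and every output element of channel k gets `bias[k]` added before the resize-add epilogue. A minimal standalone sketch of that broadcast-add on the N/K0/Ho/Wo/K1 layout (plain C++, not the composable_kernel API; names and the layout assumption are illustrative):

    #include <cstddef>
    #include <vector>

    // Hypothetical host-side model: out has layout (N, K0, Ho, Wo, K1), bias has layout (K0, K1).
    void add_bias_nk0hwk1(std::vector<float>& out,
                          const std::vector<float>& bias,
                          std::size_t N, std::size_t K0,
                          std::size_t Ho, std::size_t Wo, std::size_t K1)
    {
        for(std::size_t n = 0; n < N; ++n)
            for(std::size_t k0 = 0; k0 < K0; ++k0)
                for(std::size_t ho = 0; ho < Ho; ++ho)
                    for(std::size_t wo = 0; wo < Wo; ++wo)
                        for(std::size_t k1 = 0; k1 < K1; ++k1)
                            out[(((n * K0 + k0) * Ho + ho) * Wo + wo) * K1 + k1] +=
                                bias[k0 * K1 + k1]; // same bias for every (n, ho, wo)
    }
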
@@ -27,6 +27,7 @@ __global__ void
kernel_gemm_dlops_v2_add(
    const FloatAB* __restrict__ p_a_grid,
    const FloatAB* __restrict__ p_b_grid,
+   const FloatC* __restrict__ p_bias_grid,
    FloatC* __restrict__ p_d_grid,
    const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
    const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -40,6 +41,7 @@ __global__ void
    GridwiseGemm::Run(p_a_grid,
                      p_b_grid,
+                     p_bias_grid,
                      p_d_grid,
                      p_shared_block,
                      a_e0_e1_k0_k1_e2_grid_desc,
@@ -66,6 +68,7 @@ __global__ void
#endif
kernel_gemm_dlops_v2_add(const FloatAB* __restrict__ p_a_grid,
                         const FloatAB* __restrict__ p_b_grid,
+                        const FloatC* __restrict__ p_bias_grid,
                         FloatC* __restrict__ p_d_grid,
                         const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
                         const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -95,6 +98,7 @@ __global__ void
    GridwiseGemm::Run(p_a_grid,
                      p_b_grid,
+                     p_bias_grid,
                      p_d_grid,
                      p_shared_block,
                      a_e0_e1_k0_k1_e2_grid_desc,
@@ -290,13 +294,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        const auto K1 = Number<KPerBlock>{};
        const auto K0 = K / K1;
-       const auto H2 = Number<HoPerThread * 2>{};
+       const auto H2 = HoPerThread * 2;
        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-       const auto H0 = Number<Hox2 / (H1 * H2)>{};
+       const auto H0 = Hox2 / (H1 * H2);
-       const auto W2 = Number<WoPerThread * 2>{};
+       const auto W2 = WoPerThread * 2;
        const auto W1 = Number<WoPerBlock / WoPerThread>{};
-       const auto W0 = Number<Wox2 / (W1 * W2)>{};
+       const auto W0 = Wox2 / (W1 * W2);
        const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc = transform_tensor_descriptor(
            d_k_n_hox2_wox2_grid_desc,
@@ -340,10 +344,20 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
    using CBlockIdToBlockClusterAdaptor_K_N_H_W =
        decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));

+   __host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor(
+       const DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2& d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc)
+   {
+       const auto K0 = d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetLength(I0);
+       const auto K1 = d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetLength(I1);
+
+       return make_naive_tensor_descriptor_packed(make_tuple(K0, K1));
+   }
    template <bool HasMainE0BlockLoop>
    __device__ static void
    Run(const FloatAB* __restrict__ p_a_global,
        const FloatAB* __restrict__ p_b_global,
+       const FloatC* __restrict__ p_bias_global,
        FloatC* __restrict__ p_d_global,
        FloatAB* __restrict__ p_shared_block,
        const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
@@ -352,12 +366,26 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>)
    {
+       // constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
+       // constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
+       //     BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
+       // constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc =
+       //     DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2{};
+       // constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
+       //     CBlockIdToBlockClusterAdaptor_K_N_H_W{};
+
+       const auto bias_k0_k1_grid_desc =
+           MakeBiasK0K1GridDescriptor(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);

        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_d_global, d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetElementSpaceSize());
+       auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+           p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());

        constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop();
        constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
@@ -747,6 +775,57 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
            });
        }

+       // bias
+       {
+           constexpr auto bias_k0_k1_thread_desc =
+               make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
+
+           StaticBuffer<AddressSpaceEnum_t::Vgpr,
+                        FloatC,
+                        bias_k0_k1_thread_desc.GetElementSpaceSize(),
+                        true>
+               bias_thread_buf;
+
+           const index_t k_thread_data_on_global = k_thread_id * KPerThread;
+
+           auto bias_threadwise_transfer =
+               ThreadwiseTensorSliceTransfer_v2<FloatC,
+                                                FloatC,
+                                                decltype(bias_k0_k1_grid_desc),
+                                                decltype(bias_k0_k1_thread_desc),
+                                                Sequence<I1, Number<KPerThread>{}>,
+                                                Sequence<0, 1>,
+                                                1,
+                                                CThreadTransferDstScalarPerVector,
+                                                false,
+                                                true>(
+                   bias_k0_k1_grid_desc,
+                   make_multi_index(k_block_work_id, k_thread_data_on_global));
+
+           constexpr auto bias_k0_k1_global_tensor_step_hacks = make_tuple(
+               make_tuple(Sequence<0>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<0>{}));
+
+           bias_threadwise_transfer.Run(bias_k0_k1_grid_desc,
+                                        bias_global_buf,
+                                        bias_k0_k1_thread_desc,
+                                        make_tuple(I0, I0),
+                                        bias_thread_buf,
+                                        bias_k0_k1_global_tensor_step_hacks);
+
+#if 1
+           static_for<0, KPerThread, 1>{}([&](auto ki) {
+               static_for<0, HoPerThread, 1>{}([&](auto hi) {
+                   static_for<0, WoPerThread, 1>{}([&](auto wi) {
+                       constexpr index_t c_offset = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
+                           make_tuple(ki, 0, hi, wi));
+
+                       c_thread_buf(Number<c_offset>{}) =
+                           c_thread_buf[Number<c_offset>{}] + bias_thread_buf[ki];
+                   });
+               });
+           });
+#endif
+       }
        // Resize_Add
        {
            constexpr auto HoPerThreadx2 = HoPerThread * 2;
@@ -843,7 +922,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
                CThreadTransferSrcDstAccessOrder,
                CThreadTransferSrcDstVectorDim,
                CThreadTransferDstScalarPerVector,
-               InMemoryDataOperationEnum_t::Add, // CGlobalMemoryDataOperation,
+               CGlobalMemoryDataOperation,
                1,
                true>(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
                      make_multi_index(k_block_work_id,
...
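
The `// bias` block above loads each thread's KPerThread bias values into registers once (via ThreadwiseTensorSliceTransfer_v2 on the (K0, K1) descriptor) and then adds them to the whole accumulator tile before the resize-add writeback. A simplified per-thread model of that epilogue, assuming a plain register tile instead of CK's StaticBuffer/static_for machinery:

    // Sketch only: c_thread is the thread's accumulator tile, p_bias_global the flattened
    // (K0*K1) bias vector, k_thread_data_on_global the thread's first output channel.
    template <int KPerThread, int HoPerThread, int WoPerThread>
    __device__ void add_bias_to_thread_tile(float (&c_thread)[KPerThread][HoPerThread][WoPerThread],
                                            const float* __restrict__ p_bias_global,
                                            int k_thread_data_on_global)
    {
        float bias_reg[KPerThread];
        for(int k = 0; k < KPerThread; ++k) // one global read per owned output channel
            bias_reg[k] = p_bias_global[k_thread_data_on_global + k];

        for(int k = 0; k < KPerThread; ++k)
            for(int h = 0; h < HoPerThread; ++h)
                for(int w = 0; w < WoPerThread; ++w)
                    c_thread[k][h][w] += bias_reg[k]; // broadcast over the spatial tile
    }
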
@@ -537,6 +537,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
        }
        else if constexpr(N == 8)
        {
+#if 0
            vector_type<half_t, 8> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
@@ -550,6 +551,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(half_t),
                                                0);
+#else
+           llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset,
+                                               0);
+#endif
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
...
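
The amd_buffer_store_impl change above disables the pair of fp16x4 raw-buffer stores for the half_t, N == 8 case and instead issues a single fp32x4 store of the same 16 bytes, bit-reinterpreted. A small sketch of the underlying idea, assuming clang's ext_vector_type and __builtin_bit_cast (the same building blocks CK's vector_type uses); the raw-buffer addressing details are left out:

    using half8_t  = _Float16 __attribute__((ext_vector_type(8)));
    using float4_t = float __attribute__((ext_vector_type(4)));

    __device__ void store_half8_as_float4(half8_t v, float4_t* __restrict__ dst)
    {
        // 8 x _Float16 and 4 x float are both 16 bytes: reinterpret the bits with no numeric
        // conversion, so one 128-bit store replaces two 64-bit ones.
        *dst = __builtin_bit_cast(float4_t, v);
    }
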
@@ -26,9 +26,9 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
+   const Tensor<TOut>& bias_k0_k1,
    const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
    Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
-   Tensor<TOut>& out_n_k0_ho_wo_k1,
    ck::index_t nrepeat)
{
    using namespace ck;
@@ -62,13 +62,13 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
+   DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
-   DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
-                                          out_n_k0_ho_wo_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
+   bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());

    constexpr index_t InWeiVectorSize = 8;
@@ -187,8 +187,8 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
            wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
        static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
            in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+       static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
        static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
-       static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
        nrepeat);

    {
@@ -214,10 +214,9 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
            wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
        static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
            in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+       static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
        static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
-       static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
        0);

    add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
-   out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
    }
@@ -48,8 +48,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
        const InRightPads& in_right_pads,
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
+       const FloatC* __restrict__ p_bias_grid,
        FloatC* __restrict__ p_d_grid,
-       FloatC* __restrict__ p_c_grid,
        const int nrepeat) const
    {
        using namespace ck;
@@ -279,7 +279,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            FloatAB,
            FloatAcc,
            FloatC,
-           InMemoryDataOperationEnum_t::Set,
+           InMemoryDataOperationEnum_t::Add,
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(d_k_n_hopx2_wopx2_grid_desc),
@@ -366,6 +366,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                a_e0_e1_k0_k1_e2_grid_desc,
                b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -391,6 +392,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                a_e0_e1_k0_k1_e2_grid_desc,
                b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -436,6 +438,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
@@ -467,6 +470,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
...
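
Besides forwarding `p_bias_grid` into each launch variant, the driver switches the output memory operation from Set to Add: the grid-wise GEMM now takes the `CGlobalMemoryDataOperation` template parameter at face value (see the @@ -843 hunk above) instead of hardcoding Add, so the driver must request Add explicitly to keep accumulating into the pre-loaded resize/skip tensor. A toy model of the two write modes (not CK's InMemoryDataOperationEnum_t machinery):

    enum class MemOp { Set, Add };

    template <MemOp Op>
    __device__ void write_output_element(float* __restrict__ dst, float val)
    {
        if constexpr(Op == MemOp::Set)
            *dst = val;  // overwrite whatever is already in d
        else
            *dst += val; // accumulate onto the tensor already staged in d
    }
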
@@ -105,7 +105,7 @@ int main(int argc, char* argv[])
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
-#elif 0
+#elif 1
    constexpr auto N = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
@@ -182,7 +182,7 @@ int main(int argc, char* argv[])
#endif

    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
-       add_lengths_host(5);
+       add_lengths_host(5), bias_lengths_host(2);

    in_lengths_host[0] = static_cast<std::size_t>(N);
    in_lengths_host[1] = static_cast<std::size_t>(C0);
@@ -208,18 +208,21 @@ int main(int argc, char* argv[])
    add_lengths_host[3] = static_cast<std::size_t>(Wox2);
    add_lengths_host[4] = static_cast<std::size_t>(K1);

+   bias_lengths_host[0] = static_cast<std::size_t>(K0);
+   bias_lengths_host[1] = static_cast<std::size_t>(K1);

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<in_data_t> add(add_lengths_host);
-   Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);
    Tensor<in_data_t> add_device(add_lengths_host);
    Tensor<in_data_t> add_host(add_lengths_host);
+   Tensor<out_data_t> bias(bias_lengths_host);
+   Tensor<out_data_t> out_host(out_lengths_host);
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
-   ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");

    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
@@ -262,6 +265,7 @@ int main(int argc, char* argv[])
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

+   bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);

    auto f_make_for_device_nchwc = [&]() {
@@ -303,9 +307,9 @@ int main(int argc, char* argv[])
            tmp[I7], // in_right_pads_dev
            in,
            wei,
+           bias,
            add,
            add_device,
-           out_device,
            nrepeat);
    }
#endif
@@ -315,6 +319,7 @@ int main(int argc, char* argv[])
        host_direct_convolution_add_nchwc(in,
                                          wei,
                                          add,
+                                         bias,
                                          add_host,
                                          out_host,
                                          make_tuple(conv_stride_h, conv_stride_w),
@@ -329,7 +334,6 @@ int main(int argc, char* argv[])
        {
            // LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            // LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-           // LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
            // LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
            // std::endl;
            LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
...
@@ -102,7 +102,7 @@ int main(int argc, char* argv[])
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
-#elif 0
+#elif 1
    constexpr auto N = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
...
@@ -167,6 +167,7 @@ template <typename TIn,
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                                       const Tensor<TWei>& wei,
                                       const Tensor<TOut>& add,
+                                      const Tensor<TOut>& bias,
                                       Tensor<TOut>& add_host,
                                       Tensor<TOut>& out_host,
                                       const ConvStrides& conv_strides,
@@ -204,7 +205,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                    }
                }

-               v = activ(v, activ_type);
+               v = activ(v, activ_type) + bias(k0, k1);

                const int hox2 = ho * 2;
                const int wox2 = wo * 2;
...
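
In the host reference, the bias is applied per output channel right after the activation, so the same value feeds both out_host and the 2x-upsampled add_host. A self-contained stand-in for that epilogue (the real example's activ() helper is replaced by a placeholder identity/ReLU, which is an assumption):

    #include <algorithm>

    // Placeholder for the example's activ() helper: 0 = identity, otherwise ReLU (assumption).
    inline float activ_stub(float x, int activ_type)
    {
        return activ_type == 0 ? x : std::max(x, 0.0f);
    }

    // Reference epilogue as changed above: activation first, then the per-channel bias.
    inline float conv_epilogue(float acc, float bias_k, int activ_type)
    {
        return activ_stub(acc, activ_type) + bias_k;
    }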