"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "67533c798ca95172d7b9999dd28d9d6e57a67432"
Commit 3e298e42 authored by Jing Zhang

add bias

parent 1e6d6782
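
For orientation: this commit threads a per-output-channel bias through the v5r1 implicit-GEMM convolution-add path. The kernels and the driver gain a `p_bias_grid` pointer laid out as (K0, K1), and every output element of channel k gets `bias[k]` added before the resize-add epilogue. A minimal standalone sketch of that broadcast-add on the N/K0/Ho/Wo/K1 layout (plain C++, not the composable_kernel API; names and the layout assumption are illustrative):

    #include <cstddef>
    #include <vector>

    // Hypothetical host-side model: out has layout (N, K0, Ho, Wo, K1), bias has layout (K0, K1).
    void add_bias_nk0hwk1(std::vector<float>& out,
                          const std::vector<float>& bias,
                          std::size_t N, std::size_t K0,
                          std::size_t Ho, std::size_t Wo, std::size_t K1)
    {
        for(std::size_t n = 0; n < N; ++n)
            for(std::size_t k0 = 0; k0 < K0; ++k0)
                for(std::size_t ho = 0; ho < Ho; ++ho)
                    for(std::size_t wo = 0; wo < Wo; ++wo)
                        for(std::size_t k1 = 0; k1 < K1; ++k1)
                            out[(((n * K0 + k0) * Ho + ho) * Wo + wo) * K1 + k1] +=
                                bias[k0 * K1 + k1]; // same bias for every (n, ho, wo)
    }
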
@@ -27,6 +27,7 @@ __global__ void
kernel_gemm_dlops_v2_add(
    const FloatAB* __restrict__ p_a_grid,
    const FloatAB* __restrict__ p_b_grid,
+   const FloatC* __restrict__ p_bias_grid,
    FloatC* __restrict__ p_d_grid,
    const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
    const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -40,6 +41,7 @@ __global__ void
    GridwiseGemm::Run(p_a_grid,
                      p_b_grid,
+                     p_bias_grid,
                      p_d_grid,
                      p_shared_block,
                      a_e0_e1_k0_k1_e2_grid_desc,
@@ -66,6 +68,7 @@ __global__ void
#endif
kernel_gemm_dlops_v2_add(const FloatAB* __restrict__ p_a_grid,
                         const FloatAB* __restrict__ p_b_grid,
+                        const FloatC* __restrict__ p_bias_grid,
                         FloatC* __restrict__ p_d_grid,
                         const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
                         const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -95,6 +98,7 @@ __global__ void
    GridwiseGemm::Run(p_a_grid,
                      p_b_grid,
+                     p_bias_grid,
                      p_d_grid,
                      p_shared_block,
                      a_e0_e1_k0_k1_e2_grid_desc,
@@ -290,13 +294,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        const auto K1 = Number<KPerBlock>{};
        const auto K0 = K / K1;
-       const auto H2 = Number<HoPerThread * 2>{};
+       const auto H2 = HoPerThread * 2;
        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-       const auto H0 = Number<Hox2 / (H1 * H2)>{};
+       const auto H0 = Hox2 / (H1 * H2);
-       const auto W2 = Number<WoPerThread * 2>{};
+       const auto W2 = WoPerThread * 2;
        const auto W1 = Number<WoPerBlock / WoPerThread>{};
-       const auto W0 = Number<Wox2 / (W1 * W2)>{};
+       const auto W0 = Wox2 / (W1 * W2);
        const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc = transform_tensor_descriptor(
            d_k_n_hox2_wox2_grid_desc,
@@ -340,10 +344,20 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
    using CBlockIdToBlockClusterAdaptor_K_N_H_W =
        decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));

+   __host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor(
+       const DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2& d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc)
+   {
+       const auto K0 = d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetLength(I0);
+       const auto K1 = d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetLength(I1);
+
+       return make_naive_tensor_descriptor_packed(make_tuple(K0, K1));
+   }
    template <bool HasMainE0BlockLoop>
    __device__ static void
    Run(const FloatAB* __restrict__ p_a_global,
        const FloatAB* __restrict__ p_b_global,
+       const FloatC* __restrict__ p_bias_global,
        FloatC* __restrict__ p_d_global,
        FloatAB* __restrict__ p_shared_block,
        const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
@@ -352,12 +366,26 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>)
    {
+       // constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
+       // constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
+       //     BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
+       // constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc =
+       //     DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2{};
+       // constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
+       //     CBlockIdToBlockClusterAdaptor_K_N_H_W{};
+
+       const auto bias_k0_k1_grid_desc =
+           MakeBiasK0K1GridDescriptor(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);

        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_d_global, d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetElementSpaceSize());
+       auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+           p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());

        constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop();
        constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
@@ -747,6 +775,57 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
            });
        }

+       // bias
+       {
+           constexpr auto bias_k0_k1_thread_desc =
+               make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
+
+           StaticBuffer<AddressSpaceEnum_t::Vgpr,
+                        FloatC,
+                        bias_k0_k1_thread_desc.GetElementSpaceSize(),
+                        true>
+               bias_thread_buf;
+
+           const index_t k_thread_data_on_global = k_thread_id * KPerThread;
+
+           auto bias_threadwise_transfer =
+               ThreadwiseTensorSliceTransfer_v2<FloatC,
+                                                FloatC,
+                                                decltype(bias_k0_k1_grid_desc),
+                                                decltype(bias_k0_k1_thread_desc),
+                                                Sequence<I1, Number<KPerThread>{}>,
+                                                Sequence<0, 1>,
+                                                1,
+                                                CThreadTransferDstScalarPerVector,
+                                                false,
+                                                true>(
+                   bias_k0_k1_grid_desc,
+                   make_multi_index(k_block_work_id, k_thread_data_on_global));
+
+           constexpr auto bias_k0_k1_global_tensor_step_hacks = make_tuple(
+               make_tuple(Sequence<0>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<0>{}));
+
+           bias_threadwise_transfer.Run(bias_k0_k1_grid_desc,
+                                        bias_global_buf,
+                                        bias_k0_k1_thread_desc,
+                                        make_tuple(I0, I0),
+                                        bias_thread_buf,
+                                        bias_k0_k1_global_tensor_step_hacks);
+
+#if 1
+           static_for<0, KPerThread, 1>{}([&](auto ki) {
+               static_for<0, HoPerThread, 1>{}([&](auto hi) {
+                   static_for<0, WoPerThread, 1>{}([&](auto wi) {
+                       constexpr index_t c_offset = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
+                           make_tuple(ki, 0, hi, wi));
+
+                       c_thread_buf(Number<c_offset>{}) =
+                           c_thread_buf[Number<c_offset>{}] + bias_thread_buf[ki];
+                   });
+               });
+           });
+#endif
+       }
        // Resize_Add
        {
            constexpr auto HoPerThreadx2 = HoPerThread * 2;
@@ -843,7 +922,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
                CThreadTransferSrcDstAccessOrder,
                CThreadTransferSrcDstVectorDim,
                CThreadTransferDstScalarPerVector,
-               InMemoryDataOperationEnum_t::Add, // CGlobalMemoryDataOperation,
+               CGlobalMemoryDataOperation,
                1,
                true>(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
                      make_multi_index(k_block_work_id,
...
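
The `// bias` block above loads each thread's KPerThread bias values into registers once (via ThreadwiseTensorSliceTransfer_v2 on the (K0, K1) descriptor) and then adds them to the whole accumulator tile before the resize-add writeback. A simplified per-thread model of that epilogue, assuming a plain register tile instead of CK's StaticBuffer/static_for machinery:

    // Sketch only: c_thread is the thread's accumulator tile, p_bias_global the flattened
    // (K0*K1) bias vector, k_thread_data_on_global the thread's first output channel.
    template <int KPerThread, int HoPerThread, int WoPerThread>
    __device__ void add_bias_to_thread_tile(float (&c_thread)[KPerThread][HoPerThread][WoPerThread],
                                            const float* __restrict__ p_bias_global,
                                            int k_thread_data_on_global)
    {
        float bias_reg[KPerThread];
        for(int k = 0; k < KPerThread; ++k) // one global read per owned output channel
            bias_reg[k] = p_bias_global[k_thread_data_on_global + k];

        for(int k = 0; k < KPerThread; ++k)
            for(int h = 0; h < HoPerThread; ++h)
                for(int w = 0; w < WoPerThread; ++w)
                    c_thread[k][h][w] += bias_reg[k]; // broadcast over the spatial tile
    }
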
@@ -537,6 +537,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
        }
        else if constexpr(N == 8)
        {
+#if 0
            vector_type<half_t, 8> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
@@ -550,6 +551,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(half_t),
                                                0);
+#else
+           llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset,
+                                               0);
+#endif
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
...
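
The amd_buffer_store_impl change above disables the pair of fp16x4 raw-buffer stores for the half_t, N == 8 case and instead issues a single fp32x4 store of the same 16 bytes, bit-reinterpreted. A small sketch of the underlying idea, assuming clang's ext_vector_type and __builtin_bit_cast (the same building blocks CK's vector_type uses); the raw-buffer addressing details are left out:

    using half8_t  = _Float16 __attribute__((ext_vector_type(8)));
    using float4_t = float __attribute__((ext_vector_type(4)));

    __device__ void store_half8_as_float4(half8_t v, float4_t* __restrict__ dst)
    {
        // 8 x _Float16 and 4 x float are both 16 bytes: reinterpret the bits with no numeric
        // conversion, so one 128-bit store replaces two 64-bit ones.
        *dst = __builtin_bit_cast(float4_t, v);
    }
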
@@ -26,9 +26,9 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
+   const Tensor<TOut>& bias_k0_k1,
    const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
    Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
-   Tensor<TOut>& out_n_k0_ho_wo_k1,
    ck::index_t nrepeat)
{
    using namespace ck;
@@ -62,13 +62,13 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
+   DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
-   DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
-                                          out_n_k0_ho_wo_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
+   bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());

    constexpr index_t InWeiVectorSize = 8;
@@ -187,8 +187,8 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
            wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
        static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
            in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+       static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
        static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
-       static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
        nrepeat);

    {
@@ -214,10 +214,9 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
            wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
        static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
            in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+       static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
        static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
-       static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
        0);

    add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
-   out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
    }
@@ -48,8 +48,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
        const InRightPads& in_right_pads,
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
+       const FloatC* __restrict__ p_bias_grid,
        FloatC* __restrict__ p_d_grid,
-       FloatC* __restrict__ p_c_grid,
        const int nrepeat) const
    {
        using namespace ck;
@@ -279,7 +279,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            FloatAB,
            FloatAcc,
            FloatC,
-           InMemoryDataOperationEnum_t::Set,
+           InMemoryDataOperationEnum_t::Add,
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(d_k_n_hopx2_wopx2_grid_desc),
@@ -366,6 +366,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                a_e0_e1_k0_k1_e2_grid_desc,
                b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -391,6 +392,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                a_e0_e1_k0_k1_e2_grid_desc,
                b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -436,6 +438,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
@@ -467,6 +470,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                0,
                p_a_grid,
                p_b_grid,
+               p_bias_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
...
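
Besides forwarding `p_bias_grid` into each launch variant, the driver switches the output memory operation from Set to Add: the grid-wise GEMM now takes the `CGlobalMemoryDataOperation` template parameter at face value (see the @@ -843 hunk above) instead of hardcoding Add, so the driver must request Add explicitly to keep accumulating into the pre-loaded resize/skip tensor. A toy model of the two write modes (not CK's InMemoryDataOperationEnum_t machinery):

    enum class MemOp { Set, Add };

    template <MemOp Op>
    __device__ void write_output_element(float* __restrict__ dst, float val)
    {
        if constexpr(Op == MemOp::Set)
            *dst = val;  // overwrite whatever is already in d
        else
            *dst += val; // accumulate onto the tensor already staged in d
    }
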
@@ -105,7 +105,7 @@ int main(int argc, char* argv[])
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
-#elif 0
+#elif 1
    constexpr auto N = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
@@ -182,7 +182,7 @@ int main(int argc, char* argv[])
#endif

    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
-       add_lengths_host(5);
+       add_lengths_host(5), bias_lengths_host(2);

    in_lengths_host[0] = static_cast<std::size_t>(N);
    in_lengths_host[1] = static_cast<std::size_t>(C0);
@@ -208,18 +208,21 @@ int main(int argc, char* argv[])
    add_lengths_host[3] = static_cast<std::size_t>(Wox2);
    add_lengths_host[4] = static_cast<std::size_t>(K1);

+   bias_lengths_host[0] = static_cast<std::size_t>(K0);
+   bias_lengths_host[1] = static_cast<std::size_t>(K1);

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<in_data_t> add(add_lengths_host);
-   Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);
    Tensor<in_data_t> add_device(add_lengths_host);
    Tensor<in_data_t> add_host(add_lengths_host);
+   Tensor<out_data_t> bias(bias_lengths_host);
+   Tensor<out_data_t> out_host(out_lengths_host);
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
-   ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");

    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
@@ -262,6 +265,7 @@ int main(int argc, char* argv[])
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

+   bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);

    auto f_make_for_device_nchwc = [&]() {
@@ -303,9 +307,9 @@ int main(int argc, char* argv[])
            tmp[I7], // in_right_pads_dev
            in,
            wei,
+           bias,
            add,
            add_device,
-           out_device,
            nrepeat);
    }
#endif
@@ -315,6 +319,7 @@ int main(int argc, char* argv[])
        host_direct_convolution_add_nchwc(in,
                                          wei,
                                          add,
+                                         bias,
                                          add_host,
                                          out_host,
                                          make_tuple(conv_stride_h, conv_stride_w),
@@ -329,7 +334,6 @@ int main(int argc, char* argv[])
        {
            // LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            // LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-           // LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
            // LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
            // std::endl;
            LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
...
@@ -102,7 +102,7 @@ int main(int argc, char* argv[])
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
-#elif 0
+#elif 1
    constexpr auto N = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
...
@@ -167,6 +167,7 @@ template <typename TIn,
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                                       const Tensor<TWei>& wei,
                                       const Tensor<TOut>& add,
+                                      const Tensor<TOut>& bias,
                                       Tensor<TOut>& add_host,
                                       Tensor<TOut>& out_host,
                                       const ConvStrides& conv_strides,
@@ -204,7 +205,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                    }
                }

-               v = activ(v, activ_type);
+               v = activ(v, activ_type) + bias(k0, k1);

                const int hox2 = ho * 2;
                const int wox2 = wo * 2;
...
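
In the host reference, the bias is applied per output channel right after the activation, so the same value feeds both out_host and the 2x-upsampled add_host. A self-contained stand-in for that epilogue (the real example's activ() helper is replaced by a placeholder identity/ReLU, which is an assumption):

    #include <algorithm>

    // Placeholder for the example's activ() helper: 0 = identity, otherwise ReLU (assumption).
    inline float activ_stub(float x, int activ_type)
    {
        return activ_type == 0 ? x : std::max(x, 0.0f);
    }

    // Reference epilogue as changed above: activation first, then the per-channel bias.
    inline float conv_epilogue(float acc, float bias_k, int activ_type)
    {
        return activ_stub(acc, activ_type) + bias_k;
    }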