Commit a037693f authored by ltqin

Merge branch 'develop' into conv_splitk_f32

Parents: 0694d6ed 4041850f
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
          typename TAcc,
          typename TOut,
          ck::ActivTypeEnum_t activ_type,
          typename InLengths,
          typename WeiLengths,
          typename MaxLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
    const InLengths& in_n_c0_hi_wi_c1_lengths,
    const WeiLengths& wei_k_c0_y_x_c1_lengths,
    const MaxLengths& max_n_k0_hx_wx_k1_lengths,
    const OutLengths& out_n_k0_ho_wo_k1_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
    const Tensor<TOut>& bias_k0_k1,
    Tensor<TOut>& out_n_k0_ho_wo_k1,
    Tensor<TOut>& max_n_k0_hx_wx_k1,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};

    const auto N  = out_n_k0_ho_wo_k1_lengths[I0];
    const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
    const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
    const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
    const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];

    const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
    const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
    const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
    const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];

    const auto K = wei_k_c0_y_x_c1_lengths[I0];
    const auto Y = wei_k_c0_y_x_c1_lengths[I2];
    const auto X = wei_k_c0_y_x_c1_lengths[I3];

    const auto Hx = max_n_k0_hx_wx_k1_lengths[I2];
    const auto Wx = max_n_k0_hx_wx_k1_lengths[I3];

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
                                           out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
    DeviceMem max_n_k0_hx_wx_k1_device_buf(sizeof(TOut) *
                                           max_n_k0_hx_wx_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());

    constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
    }
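
    // Blocking/tiling parameters for the v5r1 implicit-GEMM + maxpool kernel. The notes below
    // are descriptive interpretations of the parameter names, not documentation from this
    // commit:
    //   - KPerBlock / HoPerBlock / WoPerBlock: output tile (channels x height x width) worked
    //     on by one workgroup.
    //   - E1 / E2: split of the reduction dimension E = C0 * Y * X * C1 consumed by the GEMM.
    //   - KPerThread / HoPerThread / WoPerThread / EPerThread: per-thread sub-tile sizes.
    //   - ABlockTransfer* / BThreadTransfer* / CThreadTransfer*: thread layouts and vector
    //     widths used to stage the A/B operands and write out C.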
#if 0
    constexpr index_t BlockSize = 256;

    constexpr index_t KPerBlock  = 32;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 64;

    constexpr index_t E1 = C0 * 9;
    constexpr index_t E2 = 1;
    constexpr index_t K2 = 2; // assumed value; this disabled configuration did not define K2

    constexpr index_t E1PerBlock = C0;

    constexpr index_t KPerThread  = 16;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
        Sequence<1, E1PerBlock, 1, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t CThreadTransferDstScalarPerVector_K  = K1;
#elif 1
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock  = 8;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;

    constexpr index_t E1 = 2 * 9;
    constexpr index_t E2 = 1;
    constexpr index_t K2 = 2;

    constexpr index_t E1PerBlock = 2;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
        Sequence<1, E1PerBlock, 1, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t CThreadTransferDstScalarPerVector_K  = InWeiVectorSize;
#endif

    if(KPerThread % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! KPerThread cannot be divided by InWeiVectorSize");
    }
    const auto in_n_c0_hi_wi_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
    const auto wei_k_c0_y_x_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
    const auto max_n_k0_hx_wx_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1));
    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));

    constexpr auto conv_driver =
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool<
            BlockSize,
            typename vector_type<TInWei, InWeiVectorSize>::type,
            TAcc,
            TOut,
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            BThreadTransferSrcScalarPerVector_E2,
            CThreadTransferDstScalarPerVector_K,
            activ_type>{};

    std::cerr << "conv_bias_activ_maxpool_input_"
              << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
              << "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
              << "h" << Ho << "w" << Wo << "k" << K1 << "_maxpoolout_n" << N << "k" << K0 << "h"
              << Ho / 2 << "w" << Wo / 2 << "k" << K1 << std::endl;
    for(int i = 0; i < 5; i++)
    {
        const auto ave_time =
            conv_driver.Run(wei_k_c0_y_x_c1_desc,
                            in_n_c0_hi_wi_c1_desc,
                            out_n_k0_ho_wo_k1_desc,
                            max_n_k0_hx_wx_k1_desc,
                            conv_strides,
                            conv_dilations,
                            in_left_pads,
                            in_right_pads,
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(max_n_k0_hx_wx_k1_device_buf.GetDeviceBuffer()),
                            nrepeat);

        {
            float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }

    out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
    max_n_k0_hx_wx_k1_device_buf.FromDevice(max_n_k0_hx_wx_k1.mData.data());
}
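
For orientation, the convolution and max-pool output sizes logged above and the TFlop/s figure computed in the timing loop can be checked with plain, self-contained C++. The sketch below is illustrative only: the sample problem size, the stride/pad/dilation of 1, the assumed timing, and the helper name conv_out_size are assumptions, not values taken from this commit.

#include <cstddef>
#include <iostream>

// Hypothetical helper: output extent of a padded, dilated, strided convolution.
std::size_t conv_out_size(std::size_t in, std::size_t pad_l, std::size_t pad_r,
                          std::size_t filter, std::size_t dilation, std::size_t stride)
{
    const std::size_t filter_eff = (filter - 1) * dilation + 1;
    return (in + pad_l + pad_r - filter_eff) / stride + 1;
}

int main()
{
    // Assumed sample problem: N=1, C0=4, C1=8 (32 input channels), K=64 filters of 3x3.
    const std::size_t N = 1, C0 = 4, C1 = 8, Hi = 32, Wi = 32, K = 64, Y = 3, X = 3;

    const std::size_t Ho = conv_out_size(Hi, 1, 1, Y, 1, 1); // stride 1, pad 1, dilation 1
    const std::size_t Wo = conv_out_size(Wi, 1, 1, X, 1, 1);

    // 2x2 max-pool output, matching the Ho/2 x Wo/2 shape in the log line above.
    const std::size_t Hx = Ho / 2, Wx = Wo / 2;

    // Same FLOP count and TFlop/s formula as the timing loop above (ave_time in milliseconds).
    const double ave_time_ms = 0.5; // assumed measurement
    const double flop        = 2.0 * N * K * Ho * Wo * C0 * C1 * Y * X;
    const double tflops      = flop / 1.0e9 / ave_time_ms;

    std::cout << "conv out " << Ho << "x" << Wo << ", pool out " << Hx << "x" << Wx << ", "
              << tflops << " TFlop/s" << std::endl;
}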
@@ -15,15 +15,13 @@
 #include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
 #include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
 
-#define USE_DYNAMIC_MODE 1
+#define USE_DYNAMIC_MODE 0
 #define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
-#define USE_CONV_FWD_V5R1_NCHW 0
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
@@ -41,9 +39,8 @@ enum ConvForwardAlgo
     V4R4NCHW,      // 0
     V4R4R2NHWC,    // 1
     V6R1NCHW,      // 2
-    V5R1NCHW,      // 3
-    V4R4R2XDLNCHW, // 4
-    V4R4R4XDLNHWC  // 5
+    V4R4R2XDLNCHW, // 3
+    V4R4R4XDLNHWC  // 4
 };
 
 template <typename TIn,
@@ -97,7 +94,7 @@ void host_convolution_forward(const Tensor<TIn>& in,
         if constexpr(is_same<TOut, ushort>::value)
         {
-            out(n, k, ho, wo) = type_convert<ushort>(v);
+            out(n, k, ho, wo) = ck::type_convert<ushort>(static_cast<float>(v));
         }
         else
         {
@@ -134,7 +131,7 @@ void host_convolution_forward(const Tensor<TIn>& in,
         }
         if constexpr(is_same<TOut, ushort>::value)
        {
-            out(n, ho, wo, k) = ck::type_convert<ushort>(v);
+            out(n, ho, wo, k) = ck::type_convert<ushort>(static_cast<float>(v));
         }
         else
         {
@@ -237,8 +234,8 @@ int main(int argc, char* argv[])
     constexpr auto Y = Number<3>{};
     constexpr auto X = Number<3>{};
 
-    constexpr auto conv_stride_h = I2;
-    constexpr auto conv_stride_w = I2;
+    constexpr auto conv_stride_h = I1;
+    constexpr auto conv_stride_w = I1;
     constexpr auto conv_dilation_h = I1;
     constexpr auto conv_dilation_w = I1;
     constexpr auto in_left_pad_h = I1;
@@ -253,15 +250,15 @@ int main(int argc, char* argv[])
     constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
 #endif
 
-#if 0
+#if 1
     using in_data_t  = float;
     using acc_data_t = float;
     using out_data_t = float;
-#elif 0
+#elif 1
     using in_data_t  = half_t;
     using acc_data_t = float;
     using out_data_t = half_t;
-#elif 1
+#elif 0
     using in_data_t  = ushort;
     using acc_data_t = float;
     using out_data_t = ushort;
@@ -472,33 +469,6 @@ int main(int argc, char* argv[])
     }
 #endif
 
-#if USE_CONV_FWD_V5R1_NCHW
-    if(algo == ConvForwardAlgo::V5R1NCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-
-        const auto tmp = f_make_for_device_nchw();
-
-        device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
-            16,
-            acc_data_t,
-            out_data_t>(tmp[I0],
-                tmp[I1],
-                tmp[I2],
-                tmp[I3],
-                tmp[I4],
-                tmp[I5],
-                tmp[I6],
-                in,
-                wei,
-                out_device,
-                nrepeat);
-    }
-#endif
-
 #if USE_CONV_FWD_V4R4R2_XDL_NCHW
 if(algo == ConvForwardAlgo::V4R4R2XDLNCHW)
 {
@@ -239,14 +239,10 @@ int main(int argc, char* argv[])
     using ab_data_t  = float;
     using acc_data_t = float;
     using c_data_t   = float;
-#elif 0
+#elif 1
     using ab_data_t  = half_t;
     using acc_data_t = float;
     using c_data_t   = half_t;
-#elif 1
-    using ab_data_t  = ushort;
-    using acc_data_t = float;
-    using c_data_t   = ushort;
 #elif 1
     using ab_data_t  = int8_t;
     using acc_data_t = int32_t;
@@ -74,4 +74,17 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
     return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 }
 
+template <typename T>
+inline auto activ(T v, const ck::ActivTypeEnum_t activ_type)
+{
+    const T alpha = 0.3;
+
+    switch(activ_type)
+    {
+    case ck::ActivTypeEnum_t::None: return v;
+    case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
+    case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v)));
+    default: throw std::runtime_error("unsupported activ type"); break;
+    }
+}
+
 #endif
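
The activ helper added in the hunk above selects between identity, LeakyReLU with a fixed alpha of 0.3, and sigmoid. A self-contained sketch of the same mapping, using a local stand-in enum instead of ck::ActivTypeEnum_t so it compiles on its own, is:

#include <cmath>
#include <cstdio>
#include <stdexcept>

// Local stand-in for ck::ActivTypeEnum_t; illustration only.
enum class ActivType { None, LeakyRelu, Sigmoid };

float activ_ref(float v, ActivType type)
{
    const float alpha = 0.3f; // same slope as the helper above
    switch(type)
    {
    case ActivType::None: return v;
    case ActivType::LeakyRelu: return v >= 0.0f ? v : alpha * v;
    case ActivType::Sigmoid: return 1.0f / (1.0f + std::exp(-v));
    }
    throw std::runtime_error("unsupported activ type");
}

int main()
{
    const float vals[] = {-2.0f, -0.5f, 0.0f, 1.5f};

    for(float v : vals)
        std::printf("v=% .2f  leaky=% .3f  sigmoid=%.3f\n",
                    v,
                    activ_ref(v, ActivType::LeakyRelu),
                    activ_ref(v, ActivType::Sigmoid));
}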
@@ -257,6 +257,18 @@ struct Tensor
                                        mDesc.GetLengths()[3])(num_thread);
             break;
         }
+        case 5: {
+            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
+                (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
+            };
+            make_ParallelTensorFunctor(f,
+                                       mDesc.GetLengths()[0],
+                                       mDesc.GetLengths()[1],
+                                       mDesc.GetLengths()[2],
+                                       mDesc.GetLengths()[3],
+                                       mDesc.GetLengths()[4])(num_thread);
+            break;
+        }
         default: throw std::runtime_error("unspported dimension");
         }
     }
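
The new case 5 branch forwards a five-index lambda to make_ParallelTensorFunctor so rank-5 host tensors can be filled the same way as lower-rank ones. A minimal single-threaded analogue of that dispatch, written without the CK host-tensor types (the function name for_each_5d is made up for the example), looks like this:

#include <array>
#include <cstddef>
#include <cstdio>

// Visit every coordinate of a dense rank-5 tensor and apply g(i0, ..., i4); single-threaded sketch.
template <typename F>
void for_each_5d(const std::array<std::size_t, 5>& lens, F g)
{
    for(std::size_t i0 = 0; i0 < lens[0]; ++i0)
        for(std::size_t i1 = 0; i1 < lens[1]; ++i1)
            for(std::size_t i2 = 0; i2 < lens[2]; ++i2)
                for(std::size_t i3 = 0; i3 < lens[3]; ++i3)
                    for(std::size_t i4 = 0; i4 < lens[4]; ++i4)
                        g(i0, i1, i2, i3, i4);
}

int main()
{
    const std::array<std::size_t, 5> lens{2, 2, 3, 3, 4};

    std::size_t count = 0;
    for_each_5d(lens, [&](auto...) { ++count; });

    std::printf("visited %zu elements\n", count); // 2 * 2 * 3 * 3 * 4 = 144
}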