merge

f289221b · root · 961e6810 · 5a88b0b0 · f289221b · f289221b
Commit f289221b authored Mar 18, 2021 by root
3 changed files
--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -3,7 +3,10 @@
 #include "host_tensor.hpp"
 #include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

-template <class T,
+template <class TInWei,
+          ck::index_t InWeiVectorSize,
+          class TAcc,
+          class TOut,
          class InDesc,
          class WeiDesc,
          class OutDesc,
@@ -11,35 +14,33 @@ template <class T,
          class ConvDilations,
          class InLeftPads,
          class InRightPads>
-void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
-                                                                          const Tensor<T>& in_nchw,
+void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
+    InDesc,
+    const Tensor<TInWei>& in_n_c_hi_wi,
    WeiDesc,
-                                                                          const Tensor<T>& wei_kcyx,
+    const Tensor<TInWei>& wei_k_c_y_x,
    OutDesc,
-                                                                          Tensor<T>& out_nkhw,
+    Tensor<TOut>& out_n_k_ho_wo,
    ConvStrides,
    ConvDilations,
    InLeftPads,
    InRightPads,
    ck::index_t nrepeat)
 {
-    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"
-              << std::endl;
-
    using namespace ck;

-    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
+    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"
+              << std::endl;

-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
-    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
-    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
+    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

-    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
-    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
-    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
+    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());

-#if 1
+#if 0
    // run-time variables
    const auto in_n_c_hi_wi_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(InDesc::GetLengths()));
@@ -67,7 +68,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    const auto in_right_pads  = sequence_to_tuple_of_number(InRightPads{});
 #endif

-#if 0
+#if 1
    // cdata = 16, BlockSize = 64, 16x64x4
    constexpr index_t BlockSize = 64;

@@ -368,9 +369,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
 #endif
        <BlockSize,
-         TDevice,
-         TDevice,
-         TDevice,
+         typename vector_type<TInWei, InWeiVectorSize>::type,
+         TAcc,
+         TOut,
         GemmMPerBlock,
         GemmNPerBlock,
         GemmKPerBlock,
@@ -398,9 +399,11 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
                    conv_dilations,
                    in_left_pads,
                    in_right_pads,
-                    static_cast<TDevice*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                    static_cast<TDevice*>(in_nchw_device_buf.GetDeviceBuffer()),
-                    static_cast<TDevice*>(out_nkhw_device_buf.GetDeviceBuffer()));
+                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                        wei_k_c_y_x_device_buf.GetDeviceBuffer()),
+                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                        in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
+                    static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()));

-    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
 }
--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
@@ -13,26 +13,25 @@ template <class TInWei,
          class ConvStrides,
          class ConvDilations,
          class InLeftPads,
-          class InRightPads,
-          class T>
+          class InRightPads>
 void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
    InDesc,
-    const Tensor<T>& in_n_c_hi_wi,
+    const Tensor<TInWei>& in_n_c_hi_wi,
    WeiDesc,
-    const Tensor<T>& wei_k_c_y_x,
+    const Tensor<TInWei>& wei_k_c_y_x,
    OutDesc,
-    Tensor<T>& out_n_k_ho_wo,
+    Tensor<TOut>& out_n_k_ho_wo,
    ConvStrides,
    ConvDilations,
    InLeftPads,
    InRightPads,
    ck::index_t nrepeat)
 {
+    using namespace ck;
+
    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk"
              << std::endl;

-    using namespace ck;
-
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -24,20 +24,6 @@ int main(int argc, char* argv[])
    using namespace ck;

 #if 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t HI = 1;
-    constexpr index_t WI = 64;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    using LeftPads  = Sequence<1, 1>;
-    using RightPads = Sequence<1, 1>;
-#elif 0
    constexpr index_t N  = 1;
    constexpr index_t C  = 16;
    constexpr index_t HI = 1080;
@@ -151,7 +137,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
    // 3x3, 71x71
    constexpr index_t N  = 128;
    constexpr index_t C  = 192;
@@ -634,9 +620,9 @@ int main(int argc, char* argv[])
 #if 1
    using in_data_t                  = float;
    constexpr index_t in_vector_size = 1;
-    using out_data_t                 = float;
    using acc_data_t                 = float;
-#else
+    using out_data_t                 = float;
+#elif 1
    using in_data_t                  = int8_t;
    constexpr index_t in_vector_size = 4;
    using acc_data_t                 = int32_t;
@@ -720,20 +706,13 @@ int main(int argc, char* argv[])
                                                                 LeftPads{},
                                                                 RightPads{},
                                                                 nrepeat);
-#elif 0
-    device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
-                                                                         in_nchw,
-                                                                         wei_kcyx_desc,
-                                                                         wei_kcyx,
-                                                                         out_nkhw_desc,
-                                                                         out_nkhw_device,
-                                                                         ConvStrides{},
-                                                                         ConvDilations{},
-                                                                         LeftPads{},
-                                                                         RightPads{},
-                                                                         nrepeat);
-#elif 0
-    device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(in_nchw_desc,
+#elif 1
+    device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
+                                                                         in_vector_size,
+                                                                         acc_data_t,
+                                                                         out_data_t>
+
+        (in_nchw_desc,
         in_nchw,
         wei_kcyx_desc,
         wei_kcyx,