v4r4r4 fp16

47b26f0f · ltqin · 3c150a8f · 47b26f0f · 47b26f0f
Commit 47b26f0f authored Sep 23, 2021 by ltqin
2 changed files
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp
@@ -4,7 +4,8 @@
 #include "transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
 #include "driver_gemm_xdlops_v2r4.hpp"
-template <typename TInWei,
+template <typename TIn,
+          typename TWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
@@ -23,8 +24,8 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
-    const Tensor<TInWei>& in_n_hi_wi_c,
+    const Tensor<TIn>& in_n_hi_wi_c,
-    Tensor<TInWei>& wei_k_y_x_c,
+    Tensor<TWei>& wei_k_y_x_c,
    const Tensor<TOut>& out_n_ho_wo_k,
    GemmKBatchType GemmKBatch,
    ck::index_t nrepeat)
@@ -38,8 +39,8 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
-    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TIn) * in_n_hi_wi_c.mDesc.GetElementSpace());
-    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem wei_k_y_x_c_device_buf(sizeof(TWei) * wei_k_y_x_c.mDesc.GetElementSpace());
    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
@@ -176,9 +177,9 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_
    {
        float ave_time = driver_gemm_xdlops_v2r4<
            BlockSize,
-            TInWei,
+            TIn,
            TAcc,
-            TOut,
+            TWei,
            InMemoryDataOperationEnum_t::AtomicAdd,
            decltype(in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc),
            decltype(out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc),
@@ -216,9 +217,9 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_
            decltype(in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
            decltype(out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
            false // CAccessOrderMRepeatNRepeat
-            >(static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
+            >(static_cast<TIn*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
              static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
-              static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
+              static_cast<TWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
              in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
              out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
              wei_gemmm_gemmn_grid_desc,

--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -22,7 +22,7 @@
 #define USE_CONV_WRW_V4R4R2_XDL_NCHW 0
 #define USE_CONV_WRW_V4R4R4_XDL_NHWC 0
 #define USE_CONV_WRW_V4R4R2_XDL_ATOMIC_NCHW 0
-#define USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC 0
+#define USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC 1
 #define USE_CONV_WRW_V4R4R5_XDL_ATOMIC_NHWC 1
 enum ConvBackwardWeightAlgo
@@ -360,6 +360,7 @@ int main(int argc, char* argv[])
        device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk<
            in_data_t,
+            wei_data_t,
            acc_data_t,
            out_data_t>(tmp[I0],
                        tmp[I1],