fix tuning parameters for fp16

e7746f82 · ltqin · e790467d · e7746f82
Commit e7746f82 authored Sep 03, 2021 by ltqin
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 10 deletions

host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp ...ard_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp +10 -10

No files found.
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
@@ -66,13 +66,13 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 4>;
    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 2;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
-    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 2;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
@@ -120,17 +120,17 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 2;
-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 4>;
-    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 32, 2>;
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM  = 8;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM  = 4;
-    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
-    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 4>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
-    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 8;
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 4;
-    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
 #endif