perform eval

961e6810 · root · 4b456610 · 961e6810 · 961e6810 · 961e6810
Commit 961e6810 authored Mar 18, 2021 by root
5 changed files
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -225,6 +225,33 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad

            for(index_t j = 0; j < nrepeat; ++j)
            {
+                {
+                    const auto kernel =
+                        run_gridwise_operation<gridwise_gemm,
+                                               decltype(wei_gemmk_gemmm_global_desc),
+                                               const Float*,
+                                               decltype(in_gemmk_n_ho_wo_global_desc),
+                                               const Float*,
+                                               decltype(out_gemmm_n_ho_wo_global_desc),
+                                               Float*,
+                                               integral_constant<bool, true>,
+                                               integral_constant<bool, false>>;
+
+                    launch_kernel(kernel,
+                                  dim3(GridSize),
+                                  dim3(BlockSize),
+                                  0,
+                                  0,
+                                  wei_gemmk_gemmm_global_desc,
+                                  p_wei_global,
+                                  in_gemmk_n_ho_wo_global_desc,
+                                  p_in_global,
+                                  out_gemmm_n_ho_wo_global_desc,
+                                  p_out_global,
+                                  integral_constant<bool, true>{},
+                                  integral_constant<bool, false>{});
+                }
+#if 0
                if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel =
@@ -333,6 +360,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
+#endif
            }

            timer.End();

--- a/composable_kernel/include/gridwise_operation_wrapper.hpp
+++ b/composable_kernel/include/gridwise_operation_wrapper.hpp
@@ -4,7 +4,7 @@
 template <typename GridwiseOp, typename... Xs>
 __global__ void
 #if 1
-    __launch_bounds__(64, 2)
+    __launch_bounds__(256, 2)
 #endif
        run_gridwise_operation(Xs... xs)
 {

--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -76,90 +76,10 @@ struct ThreadwiseGemm_km_kn_mn_v3
        });
    }

-#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
-    template <typename FloatA, typename FloatB, typename FloatC>
-    __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                          CDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        constexpr auto M = CDesc{}.GetLength(I0);
-        constexpr auto N = CDesc{}.GetLength(I1);
-        constexpr auto K = ADesc{}.GetLength(I0);
-
-        static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
-
-        static_for<0, K, 1>{}([&](auto k) {
-            static_for<0, M, 1>{}([&](auto m) {
-                constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
-
-                if constexpr(N == 2)
-                {
-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
-
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
-
-                    amd_assembly_outer_product_1x2(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1]);
-                }
-                else if constexpr(N == 4)
-                {
-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
-                    constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
-                    constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
-
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
-                    constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
-                    constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
-
-                    amd_assembly_outer_product_1x4(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_b[b_offset_2],
-                                                   p_b[b_offset_3],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1],
-                                                   p_c[c_offset_2],
-                                                   p_c[c_offset_3]);
-                }
-            });
-        });
-    }
-#endif
-
    template <typename FloatA, typename FloatB, typename FloatC>
    __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
    {
-#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
-        constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
-                                     ((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
-                                      (is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
-                                      (is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
-
-        if constexpr(has_amd_asm)
-        {
-            Run_amd_asm(p_a, p_b, p_c);
-        }
-        else
-        {
-            Run_source(p_a, p_b, p_c);
-        }
-#else
        Run_source(p_a, p_b, p_c);
-#endif
    }
 };


--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -68,14 +68,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
 #endif

    // cdata = 16, BlockSize = 64, 16x64x4
-    constexpr index_t BlockSize = 64;
+    constexpr index_t BlockSize = 256;

    constexpr index_t KPerBlock   = 16;
    constexpr index_t HPerBlock   = 16;
    constexpr index_t WPerBlock   = 16;
    constexpr index_t CYXPerBlock = 4;

-    constexpr index_t KPerThread   = 16;
+    constexpr index_t KPerThread   = 4;
    constexpr index_t HPerThread   = 2;
    constexpr index_t WPerThread   = 2;
    constexpr index_t CYXPerThread = 4;

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -631,7 +631,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", to_multi_index(ConvStrides{}));
    print_array("ConvDilations", to_multi_index(ConvDilations{}));

-#if 0
+#if 1
    using in_data_t                  = float;
    constexpr index_t in_vector_size = 1;
    using out_data_t                 = float;
@@ -756,22 +756,6 @@ int main(int argc, char* argv[])
                                                                         LeftPads{},
                                                                         RightPads{},
                                                                         nrepeat);
-#elif 1
-    device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk<in_data_t,
-                                                                         in_vector_size,
-                                                                         acc_data_t,
-                                                                         out_data_t>(
-        in_nchw_desc,
-        in_nchw,
-        wei_kcyx_desc,
-        wei_kcyx,
-        out_nkhw_desc,
-        out_nkhw_device,
-        ConvStrides{},
-        ConvDilations{},
-        LeftPads{},
-        RightPads{},
-        nrepeat);
 #endif

    if(do_verification)