fixed comments

34321734 · Jing Zhang · 5f11dccc · 34321734 · 34321734 · 34321734
Commit 34321734 authored Apr 07, 2021 by Jing Zhang
4 changed files
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
@@ -180,7 +180,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));
-#if 1
        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
            BlockSize,
@@ -363,7 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
-#endif
    }
 };
 } // namespace ck

--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -133,12 +133,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
        constexpr auto KPerThreadSubC = 4;
-        constexpr auto HPerThreadSubC = 2;
-        constexpr auto WPerThreadSubC = 2;
+        constexpr auto HoPerThreadSubC = 2;
+        constexpr auto WoPerThreadSubC = 2;
        static_assert(KPerThread % KPerThreadSubC == 0, "");
-        static_assert(HPerThread % HPerThreadSubC == 0, "");
+        static_assert(HPerThread % HoPerThreadSubC == 0, "");
-        static_assert(WPerThread % WPerThreadSubC == 0, "");
+        static_assert(WPerThread % WoPerThreadSubC == 0, "");
        // thread A, B for GEMM
        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
@@ -161,8 +162,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx),
                                                                    decltype(b_thread_mtx),
                                                                    decltype(c_thread_mtx),
-                                                                    HPerThreadSubC,
+                                                                    HoPerThreadSubC,
-                                                                    WPerThreadSubC>{};
+                                                                    WoPerThreadSubC>{};
        // loop over k
 #pragma unroll
        for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop)
@@ -176,10 +177,10 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
                                  p_a_thread);
 #pragma unroll
-                for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HPerThreadSubC)
+                for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HoPerThreadSubC)
                {
 #pragma unroll
-                    for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WPerThreadSubC)
+                    for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WoPerThreadSubC)
                    {
                        threadwise_gemm.Run(p_a_thread,
                                            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(

--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -36,9 +36,9 @@
 #endif
 // buffer resource
-#if CK_AMD_GPU_GFX906 || CK_AMD_GPU_GFX908
+#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
-#elif CK_AMD_GPU_GFX1030
+#elif defined(CK_AMD_GPU_GFX1030)
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #endif

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -164,26 +164,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
 #endif
    constexpr auto conv_driver =
-        // DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
+#if 0
+        DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
+#else
        DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
-            BlockSize,
+#endif
-            typename vector_type<TInWei, InWeiVectorSize>::type,
+                       BlockSize,
-            TAcc,
+                   typename vector_type<TInWei, InWeiVectorSize>::type, TAcc, TOut, KPerBlock,
-            TOut,
+                   HoPerBlock, WoPerBlock, EPerBlock, KPerThread, HoPerThread, WoPerThread,
-            KPerBlock,
+                   EPerThread, ABlockTransferThreadSliceLengths_E_K,
-            HoPerBlock,
+                   ABlockTransferThreadClusterLengths_E_K, ABlockTransferSrcScalarPerVector_E,
-            WoPerBlock,
+                   ABlockTransferDstScalarPerVector_K, BThreadTransferSrcScalarPerVector_W,
-            EPerBlock,
+                   CThreadTransferDstScalarPerVector_W > {};
-            KPerThread,
-            HoPerThread,
-            WoPerThread,
-            EPerThread,
-            ABlockTransferThreadSliceLengths_E_K,
-            ABlockTransferThreadClusterLengths_E_K,
-            ABlockTransferSrcScalarPerVector_E,
-            ABlockTransferDstScalarPerVector_K,
-            BThreadTransferSrcScalarPerVector_W,
-            CThreadTransferDstScalarPerVector_W>{};
    conv_driver.Run(wei_k_c0_y_x_desc,
                    in_n_c0_hi_wi_desc,