add nc0hwc1

418ca5ee · root · 14f22aa6 · 418ca5ee · 418ca5ee · 418ca5ee
Commit 418ca5ee authored Mar 24, 2021 by root
5 changed files
--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -135,6 +135,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        constexpr auto KPerThreadSubC = 4;

        static_assert(KPerThread % KPerThreadSubC == 0, "");
+        static_assert(HPerThread % 2 == 0, "");
+        static_assert(WPerThread % 2 == 0, "");

        // thread A, B for GEMM
        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
@@ -164,16 +166,23 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
 #pragma unroll
            for(index_t k_begin = 0; k_begin < KPerThread; k_begin += KPerThreadSubC)
            {
-
                a_thread_copy.Run(p_a_block +
                                      a_block_mtx.CalculateOffset(make_tuple(e_begin, k_begin)) +
                                      mMyThreadOffsetA,
                                  p_a_thread);

-                threadwise_gemm.Run(
-                    p_a_thread,
-                    p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(e_begin, 0, 0, 0)),
-                    p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(k_begin, 0, 0, 0)));
+                for(index_t h_begin = 0; h_begin < HPerThread; h_begin += 2)
+                {
+
+                    for(index_t w_begin = 0; w_begin < WPerThread; w_begin += 2)
+                    {
+                        threadwise_gemm.Run(p_a_thread,
+                                            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
+                                                             e_begin, 0, h_begin, w_begin)),
+                                            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(
+                                                             k_begin, 0, h_begin, w_begin)));
+                    }
+                }
            }
        }
    }

--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -54,8 +54,10 @@ struct ThreadwiseGemm_km_kn_mn_v3
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

-        constexpr auto H = BDesc{}.GetLength(I2);
-        constexpr auto W = BDesc{}.GetLength(I3);
+        // constexpr auto H = BDesc{}.GetLength(I2);
+        // constexpr auto W = BDesc{}.GetLength(I3);
+        constexpr auto H = 2;
+        constexpr auto W = 2;

        constexpr auto E = ADesc{}.GetLength(I0);
        constexpr auto K = ADesc{}.GetLength(I1);

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -29,16 +29,33 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
 {
    using namespace ck;

-    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"
+    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw"
              << std::endl;

    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

-    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto N = OutDesc::GetLengths()[I0];
+    constexpr auto K = OutDesc::GetLengths()[I1];
+    constexpr auto C = WeiDesc::GetLengths()[I1];
+
+    constexpr auto Hi = InDesc::GetLengths()[I2];
+    constexpr auto Wi = InDesc::GetLengths()[I3];
+
+    constexpr auto Ho = OutDesc::GetLengths()[I2];
+    constexpr auto Wo = OutDesc::GetLengths()[I3];
+
+    constexpr auto Y = WeiDesc::GetLengths()[I2];
+    constexpr auto X = WeiDesc::GetLengths()[I3];
+
+    constexpr auto C0 = C / Number<InWeiVectorSize>{};
+    constexpr auto C1 = Number<InWeiVectorSize>{};

 #if 0
    // run-time variables
@@ -55,12 +72,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
    const auto in_right_pads  = to_multi_index(InRightPads{});
 #else
    // compile-time variables
-    const auto in_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-        sequence_to_tuple_of_number(InDesc::GetLengths()));
-    const auto wei_k_c_y_x_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-        sequence_to_tuple_of_number(WeiDesc::GetLengths()));
-    const auto out_n_k_ho_wo_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-        sequence_to_tuple_of_number(OutDesc::GetLengths()));
+    const auto in_n_c0_hi_wi_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
+    const auto wei_k_c0_y_x_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
+    const auto out_n_k_ho_wo_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo));

    const auto conv_strides   = sequence_to_tuple_of_number(ConvStrides{});
    const auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
@@ -68,21 +85,43 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
    const auto in_right_pads  = sequence_to_tuple_of_number(InRightPads{});
 #endif

+    Tensor<TInWei> in_n_c0_hi_wi_c1(make_HostTensorDescriptor(
+        make_native_tensor_descriptor_packed(Sequence<N, C0, Hi, Wi, C1>{})));
+    Tensor<TInWei> wei_k_c0_y_x_c1(make_HostTensorDescriptor(
+        make_native_tensor_descriptor_packed(Sequence<K, C0, Y, X, C1>{})));
+
+    auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
+        in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
+            in_n_c_hi_wi(n, c, hi, wi);
+    };
+
+    auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
+        wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
+            wei_k_c_y_x(k, c, y, x);
+    };
+
+    make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
+    make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
+
+    in_n_c_hi_wi_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
+    wei_k_c_y_x_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
+    // out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+
    // cdata = 16, BlockSize = 64, 16x64x4
    constexpr index_t BlockSize = 64;

-    constexpr index_t KPerBlock  = 4;
-    constexpr index_t HoPerBlock = 16;
-    constexpr index_t WoPerBlock = 16;
+    constexpr index_t KPerBlock  = 16;
+    constexpr index_t HoPerBlock = 8;
+    constexpr index_t WoPerBlock = 32;
    constexpr index_t EPerBlock  = 1;

-    constexpr index_t KPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

-    using ABlockTransferThreadSliceLengths_E_K   = Sequence<1, 1>;
-    using ABlockTransferThreadClusterLengths_E_K = Sequence<9, 4>;
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
+    using ABlockTransferThreadClusterLengths_E_K = Sequence<4, 16>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
@@ -112,8 +151,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
            BThreadTransferSrcScalarPerVector_W,
            CThreadTransferDstScalarPerVector_W>{};

-    conv_driver.Run(wei_k_c_y_x_desc,
-                    in_n_c_hi_wi_desc,
+    conv_driver.Run(wei_k_c0_y_x_desc,
+                    in_n_c0_hi_wi_desc,
                    out_n_k_ho_wo_desc,
                    conv_strides,
                    conv_dilations,

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -62,11 +62,11 @@ int main(int argc, char* argv[])
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
+    using LeftPads                   = Sequence<0, 0>;
+    using RightPads                  = Sequence<0, 0>;
 #elif 1
    constexpr index_t N  = 1;
-    constexpr index_t C  = 4;
+    constexpr index_t C  = 16;
    constexpr index_t HI = 1080;
    constexpr index_t WI = 1920;
    constexpr index_t K  = 16;
@@ -630,12 +630,12 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", to_multi_index(ConvStrides{}));
    print_array("ConvDilations", to_multi_index(ConvDilations{}));

-#if 1
+#if 0
    using in_data_t                  = float;
    constexpr index_t in_vector_size = 1;
    using acc_data_t                 = float;
    using out_data_t                 = float;
-#elif 1
+#elif 0
    using in_data_t                  = float;
    constexpr index_t in_vector_size = 1;
    using acc_data_t                 = float;
@@ -741,7 +741,7 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
-#elif 1
+#elif 0
    device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk<in_data_t,
                                                                         in_vector_size,
                                                                         acc_data_t,

--- a/script/cmake-rocm3.7.sh
+++ b/script/cmake-rocm3.7.sh
@@ -3,7 +3,7 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=../../../
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir

 cmake                                                                                                                              \