Commit 64705e7d authored by Jing Zhang

for binary dumps

parent e9575251
@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
static constexpr auto NPerBlock = I1;
-static constexpr FloatAcc alpha = 0.30000001192092896;
+static constexpr FloatAcc alpha = 0.3;
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
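A note on the alpha change above: 0.30000001192092896 is exactly the single-precision value nearest 0.3, printed back as a double. So when FloatAcc is float the two literals produce the same bit pattern and the edit is cosmetic; it would only change behavior if FloatAcc were double. A minimal standalone check (not part of this commit, assuming FloatAcc is float):

```cpp
#include <cstdio>

int main()
{
    float a = 0.30000001192092896; // long literal: float(0.3) printed as a double
    float b = 0.3;                 // short literal, rounded to the same float
    std::printf("%.17g %.17g same=%d\n",
                static_cast<double>(a), static_cast<double>(b), a == b);
    // prints: 0.30000001192092896 0.30000001192092896 same=1
}
```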
@@ -360,7 +360,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
__host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop()
{
-const bool has_main_e1_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
+const bool has_main_e1_block_loop = ((E1 + E1PerBlock) / (2 * E1PerBlock)) > 1;
return has_main_e1_block_loop;
}
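The added parentheses are purely cosmetic, since `/` already binds tighter than `>`. With integer division the condition holds exactly when E1 + E1PerBlock >= 4 * E1PerBlock, i.e. E1 >= 3 * E1PerBlock. For example, E1 = 8 with E1PerBlock = 2 gives (8 + 2) / 4 = 2 > 1 (true), while E1 = 4 gives (4 + 2) / 4 = 1 (false).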
@@ -699,9 +699,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
decltype(a_e0_e1_k0_k1_e2_grid_desc),
decltype(a_e0_e1_k0_k1_e2_block_copy_desc),
ABlockTransferSrcAccessOrder,
-Sequence<0, 1, 2, 3, 4>, // ABlockTransferDstAccessOrder
+Sequence<0, 1, 2, 3, 4>,
ABlockTransferSrcVectorDim,
-4, // ABlockTransferDstVectorDim
+4,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_E2,
1,
@@ -9,6 +9,8 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
+//__device__ index_t get_block_1d_id() { return gridDim.x - 1 - blockIdx.x; }
} // namespace ck
#endif
@@ -71,7 +71,7 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
-constexpr index_t InWeiVectorSize = 8;
+constexpr index_t InWeiVectorSize = C1;
if(C1 % InWeiVectorSize != 0)
{
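With `InWeiVectorSize = C1`, the guard above can no longer fire: `C1 % C1 == 0` for any positive `C1`. The practical effect is that the vector width now always tracks the `C1` dimension instead of requiring `C1` to be a multiple of the hard-coded 8.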
@@ -171,6 +171,11 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_resize_add_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_addout_n" << N << "k" << K0
<< "h" << Ho * 2 << "w" << Wo * 2 << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
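The added std::cerr statement emits a shape tag, presumably used to label the binary dumps this commit is about. As a purely illustrative example (these sizes are not taken from this file), a run with N=1, C0=2, Hi=1080, Wi=1920, C1=8, K=32, Y=3, X=3, K0=4, K1=8, Ho=1080, Wo=1920 would print:

```
conv_bias_activ_resize_add_input_n1c2h1080w1920c8_filter_k32c2y3x3c8_addout_n1k4h2160w3840k8
```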
@@ -64,7 +64,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
-constexpr index_t InWeiVectorSize = 8;
+constexpr index_t InWeiVectorSize = C1;
if(C1 % InWeiVectorSize != 0)
{
@@ -157,6 +157,11 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
<< "h" << Ho << "w" << Wo << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
@@ -73,7 +73,7 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());
-constexpr index_t InWeiVectorSize = 8;
+constexpr index_t InWeiVectorSize = C1;
if(C1 % InWeiVectorSize != 0)
{
@@ -173,6 +173,12 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_maxpool_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
<< "h" << Ho << "w" << Wo << "k" << K1 << "_maxpoolout_n" << N << "k" << K0 << "h"
<< Ho / 2 << "w" << Wo / 2 << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
-constexpr index_t activ_type = 0;
+constexpr index_t activ_type = 1;
#if 0
constexpr auto N = Number<1>{};
@@ -105,7 +105,7 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{};
-#elif 1
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
@@ -125,8 +125,8 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
-#elif 0
-constexpr auto N = Number<1>{};
+#elif 1
+constexpr auto N = Number<128>{};
constexpr auto Hi = Number<135>{};
constexpr auto Wi = Number<240>{};
constexpr auto Y = Number<3>{};
@@ -92,7 +92,20 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
+constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
+// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if 0
+constexpr auto N = Number<1>{};
+constexpr auto Hi = Number<1080>{};
+constexpr auto Wi = Number<1920>{};
+constexpr auto Y = Number<3>{};
+constexpr auto X = Number<3>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
+constexpr auto K0 = Number<1>{};
+constexpr auto K1 = Number<4>{};
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
@@ -102,18 +115,28 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
+#elif 0
+constexpr auto N = Number<1>{};
+constexpr auto Hi = Number<1080>{};
+constexpr auto Wi = Number<1920>{};
+constexpr auto Y = Number<1>{};
+constexpr auto X = Number<1>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
+constexpr auto K0 = Number<2>{};
+constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
-constexpr auto Y = Number<3>{};
-constexpr auto X = Number<3>{};
+constexpr auto Y = Number<1>{};
+constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
-constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
+constexpr auto K0 = Number<8>{};
#elif 1
-constexpr auto N = Number<1>{};
+constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<1>{};
@@ -128,6 +151,7 @@ int main(int argc, char* argv[])
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
+#if 0
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
@@ -260,8 +284,6 @@ int main(int argc, char* argv[])
in_right_pads_dev);
};
-constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
@@ -96,6 +96,16 @@ int main(int argc, char* argv[])
constexpr index_t activ_type = 1;
#if 0
+constexpr auto N = Number<1>{};
+constexpr auto Hi = Number<1080>{};
+constexpr auto Wi = Number<1920>{};
+constexpr auto Y = Number<3>{};
+constexpr auto X = Number<3>{};
+constexpr auto C0 = Number<3>{};
+constexpr auto C1 = Number<4>{};
+constexpr auto K0 = Number<2>{};
+constexpr auto K1 = Number<8>{};
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
@@ -116,7 +126,7 @@ int main(int argc, char* argv[])
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 1
-constexpr auto N = Number<1>{};
+constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<3>{};
@@ -4,7 +4,7 @@
template <typename T>
inline auto activ(T v, const ck::index_t activ_type)
{
-const T alpha = 0.30000001192092896;
+const T alpha = 0.3;
switch(activ_type)
{
case 0: return v;
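Only `case 0` (the identity) is visible in this hunk; alpha is presumably the leaky-ReLU slope used by one of the other cases. A hypothetical reconstruction of the full helper, with cases 1 and 2 guessed from the LeakyRelu/Sigmoid activation types referenced elsewhere in this commit:

```cpp
#include <cmath>

// Sketch only: just "case 0: return v;" appears in the diff; the other
// cases and the ck::index_t parameter type are assumptions.
template <typename T>
inline auto activ(T v, const ck::index_t activ_type)
{
    const T alpha = 0.3; // leaky-ReLU slope (per this hunk)
    switch(activ_type)
    {
    case 0: return v;                                       // pass-through
    case 1: return v >= 0 ? v : v * alpha;                  // leaky ReLU (guess)
    case 2: return static_cast<T>(1 / (1 + std::exp(-v)));  // sigmoid (guess)
    default: return v;
    }
}
```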
@@ -127,9 +127,8 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
-for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
@@ -139,6 +138,8 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
+for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
+{
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
static_cast<const double>(wei(k, c0, y, x, c1));
@@ -185,9 +186,9 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
-for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
@@ -197,17 +198,20 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
+for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
+{
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
-     static_cast<const double>(
-         wei(k0 * out_host.mDesc.GetLengths()[4] + k1, c0, y, x, c1));
+     static_cast<const double>(wei(k, c0, y, x, c1));
}
}
}
}
}
-v = activ(v, activ_type) + bias(k0, k1);
+v += bias(k0, k1);
+v = activ(v, activ_type);
const int hox2 = ho * 2;
const int wox2 = wo * 2;
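The bias/activation reorder above is the substantive change in this host reference: bias is now added before the activation, i.e. activ(conv + bias) instead of activ(conv) + bias. The two differ whenever the activation is nonlinear around the pre-bias value; e.g. with v = -1, bias = 2 and a leaky-ReLU slope of 0.3, the old order yields activ(-1) + 2 = -0.3 + 2 = 1.7 while the new order yields activ(-1 + 2) = activ(1) = 1.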
@@ -253,9 +257,9 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
-for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
@@ -265,10 +269,11 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
+for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
+{
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
-    static_cast<const double>(
-        wei(k0 * out_host.mDesc.GetLengths()[4] + k1, c0, y, x, c1));
+    static_cast<const double>(wei(k, c0, y, x, c1));
}
}
}