Commit f64fab12 authored by Chao Liu

clean up

parent 3317bfe2
......@@ -182,7 +182,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
const index_t k_block_data_on_global = block_work_id[1] * KPerBlock;
#endif
// input tensor
// global tensor in global memory
constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
......
......@@ -176,7 +176,6 @@ struct inner_product_with_conversion
}
return acc;
}
};
} // namespace ck
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
......
......@@ -170,7 +170,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0
#elif 1
// cdata = 64, BlockSize = 256, 128x128x16
constexpr index_t BlockSize = 256;
......@@ -184,10 +184,10 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
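The cluster retuning above is thread-count neutral: the old 2x2x8x8 and the new 4x4x4x4 level-0/level-1 cluster shapes both multiply out to the 256 threads of BlockSize, and both keep 16 thread columns along each GEMM dimension (2*8 = 4*4); what changes is how the two cluster levels interleave threads over the 128x128 block tile. A compile-time check of that invariant is sketched below; the names mirror the constants above, but the check itself and the int32_t index_t alias are assumptions, not repository code.
#include <cstdint>
using index_t = std::int32_t; // assumed width; the repo defines its own index_t
constexpr index_t BlockSize          = 256;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
// The level-0 x level-1 thread clusters must tile the whole workgroup.
static_assert(GemmMLevel0Cluster * GemmNLevel0Cluster * GemmMLevel1Cluster *
                      GemmNLevel1Cluster ==
                  BlockSize,
              "cluster shape does not cover BlockSize threads");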
......@@ -561,7 +561,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
#elif 0
// cdata = 64, BlockSize = 64, 32x128x3
constexpr index_t BlockSize = 64;
......@@ -810,6 +810,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
// warm up
std::cout << "Warn up runs..." << std::endl;
for(index_t i = 0; i < 10; ++i)
{
float time =
......@@ -822,14 +825,11 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
}
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time;
// warm up
printf("Warn up running %d times...\n", nrepeat);
std::cout << "Elapsed time : " << time << " ms, " << perf << " TFlop/s" << std::endl;
}
for(index_t i = 0; i < nrepeat; ++i)
{
......@@ -845,8 +845,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
printf("Start running %d times...\n", nrepeat);
cudaDeviceSynchronize();
auto start = std::chrono::steady_clock::now();
KernelTimer timer;
timer.Start();
for(index_t i = 0; i < nrepeat; ++i)
{
......@@ -860,15 +860,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
cudaDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
printf("Average elapsed time : %f ms, %f TFlop/s\n",
ave_time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time);
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
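The hunks above replace the KernelTimer wrapper with plain host-side timing: a handful of untimed warm-up launches, then cudaDeviceSynchronize() before and after a loop of nrepeat launches measured with std::chrono::steady_clock, and a TFlop/s figure derived from the convolution FLOP count. A minimal sketch of that pattern follows; run_kernel and flop_count are hypothetical stand-ins for the driver's launch call and the result of calculate_convolution_flops, not the repository's API.
#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>
// Time nrepeat launches of run_kernel after a short warm-up and report the
// average latency and derived throughput, mirroring the pattern in the diff.
template <typename F>
void time_kernel(F&& run_kernel, int nrepeat, double flop_count)
{
    // warm up: a few untimed runs so clocks and caches settle
    for(int i = 0; i < 10; ++i)
        run_kernel();
    cudaDeviceSynchronize();
    auto start = std::chrono::steady_clock::now();
    for(int i = 0; i < nrepeat; ++i)
        run_kernel();
    cudaDeviceSynchronize();
    auto end = std::chrono::steady_clock::now();
    float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
    // F flops in t milliseconds is F / (t * 1e9) TFlop/s
    float perf = static_cast<float>(flop_count) / (1000.0f * 1000 * 1000) / ave_time;
    printf("Average time : %f ms, %f TFlop/s\n", ave_time, perf);
}
A call site would wrap the actual launcher in a lambda, e.g. time_kernel([&] { conv.Run(in, wei, out); }, nrepeat, flops);, where conv.Run is a placeholder rather than the real interface.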
......@@ -253,7 +253,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 4;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
#elif 0
#elif 1
// cdata = 64, BlockSize = 256, 128x128x16
// GemmBBlockCopySrcDataPerRead_GemmN = 4
// GemmCThreadCopyDstDataPerWrite_GemmN1 = 4
......@@ -791,7 +791,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 2;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
#elif 0
// cdata = 64, BlockSize = 64, 32x128x3
constexpr index_t BlockSize = 64;
......@@ -1000,6 +1000,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
GemmBBlockCopyDstDataPerWrite_GemmN,
GemmCThreadCopyDstDataPerWrite_GemmN1>{};
// warm up
std::cout << "Warn up runs..." << std::endl;
for(index_t i = 0; i < 10; ++i)
{
float time =
......@@ -1012,14 +1015,11 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
}
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time;
// warm up
printf("Warn up running %d times...\n", nrepeat);
std::cout << "Elapsed time : " << time << " ms, " << perf << " TFlop/s" << std::endl;
}
for(index_t i = 0; i < nrepeat; ++i)
{
......@@ -1035,8 +1035,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
printf("Start running %d times...\n", nrepeat);
cudaDeviceSynchronize();
auto start = std::chrono::steady_clock::now();
KernelTimer timer;
timer.Start();
for(index_t i = 0; i < nrepeat; ++i)
{
......@@ -1050,15 +1050,14 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
cudaDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
printf("Average elapsed time : %f ms, %f TFlop/s\n",
ave_time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time);
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
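calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) only feeds the throughput prints above; its definition is outside this diff. For a direct NCHW convolution the conventional count is 2 * N * K * C * Y * X * Ho * Wo (one multiply and one add per accumulated term), so a plain-number version would look like the sketch below; treat the formula as an assumption about that helper, not its actual source.
#include <cstddef>
// Assumed FLOP count for direct convolution: the output is N x K x Ho x Wo and
// every output element accumulates C * Y * X multiply-adds (2 flops each).
constexpr std::size_t convolution_flops(std::size_t N, std::size_t K, std::size_t C,
                                        std::size_t Y, std::size_t X,
                                        std::size_t Ho, std::size_t Wo)
{
    return std::size_t(2) * N * K * C * Y * X * Ho * Wo;
}
// Example: the 3x3, 35x35 layer enabled in the driver below (N = 128, C = 288,
// K = 384), assuming a 17x17 output from stride 2 with no padding:
// roughly 73.6 GFlop per forward pass.
static_assert(convolution_flops(128, 384, 288, 3, 3, 17, 17) == 73638346752ULL,
              "flop-count sketch");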
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
......@@ -20,8 +20,8 @@
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp"
int main(int argc, char* argv[])
{
......@@ -132,7 +132,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 3>;
using RightPads = Sequence<0, 3>;
#elif 1
#elif 0
// 3x3, 299x299 stride=2
constexpr index_t N = 128;
constexpr index_t C = 1024;
......@@ -209,10 +209,10 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
#elif 1
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 384;
constexpr index_t C = 288;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 384;
......@@ -269,7 +269,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<1, 0>;
using RightPads = Sequence<1, 0>;
#elif 1
#elif 0
// 3x3, 147x147
constexpr index_t N = 128;
constexpr index_t C = 64;
......@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 1
#elif 0
// 3x3, 73x73
constexpr index_t N = 128;
constexpr index_t C = 64;
......@@ -525,8 +525,8 @@ int main(int argc, char* argv[])
print_sequence("ConvStrides", ConvStrides{});
print_sequence("ConvDilations", ConvDilations{});
using in_data_t = half;
using out_data_t = half;
using in_data_t = float;
using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
......@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......@@ -604,7 +604,7 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 0
#elif 1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......