fixed a bug, and refactored

1de6fd07 · Chao Liu · 1410850e · 1de6fd07 · 1de6fd07 · 1de6fd07
Commit 1de6fd07 authored Jan 24, 2019 by Chao Liu
5 changed files
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -8,7 +8,7 @@
 #include "conv_common.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
-//#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
+#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
 #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
 #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
 #include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
@@ -361,7 +361,7 @@ int main()
    constexpr unsigned K  = 1;
    constexpr unsigned S  = 3;
    constexpr unsigned R  = 3;
-#elif 0
+#elif 1
    // 3x3, 34x34
    constexpr unsigned N = 64;
    constexpr unsigned C = 256;
@@ -370,7 +370,7 @@ int main()
    constexpr unsigned K = 64;
    constexpr unsigned S = 3;
    constexpr unsigned R = 3;
-#elif 1
+#elif 0
    // 3x3, 54x54
    constexpr unsigned N  = 64;
    constexpr unsigned C  = 64;
@@ -388,7 +388,7 @@ int main()
    constexpr unsigned K  = 64;
    constexpr unsigned S  = 3;
    constexpr unsigned R  = 3;
-#elif 1
+#elif 0
    // 3x3, 58x58
    constexpr unsigned N  = 64;
    constexpr unsigned C  = 64;
@@ -449,7 +449,7 @@ int main()
    device_direct_convolution_2
 #elif 0
    device_implicit_gemm_convolution_1_nchw_kcsr
-#elif 0
+#elif 1
    device_implicit_gemm_convolution_1_nchw_srck_nkhw
 #elif 1
    device_implicit_gemm_convolution_1_chwn_csrk_khwn

--- a/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh
+++ b/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh
 #pragma once
 #include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh"
+#include <unistd.h>
 template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_implicit_gemm_convolution_1_nchw_kcsr(
+void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
-    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
+                                                  const Tensor<T>& in,
+                                                  WeiDesc,
+                                                  const Tensor<T>& wei,
+                                                  OutDesc,
+                                                  Tensor<T>& out,
+                                                  unsigned nrepeat)
 {
    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
@@ -75,40 +81,45 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-    cudaEvent_t start, stop;
+    for(unsigned i = 0; i < nrepeat; ++i)
-    float elapsedTime;
+    {
+        cudaEvent_t start, stop;
-    cudaEventCreate(&start);
+        float elapsedTime;
-    cudaEventRecord(start, 0);
+        cudaEventCreate(&start);
-    gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
+        cudaEventRecord(start, 0);
-                                                   BlockSize,
-                                                   T,
+        gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
-                                                   InDesc,
+                                                       BlockSize,
-                                                   WeiDesc,
+                                                       T,
-                                                   OutDesc,
+                                                       InDesc,
-                                                   NPerBlock,
+                                                       WeiDesc,
-                                                   KPerBlock,
+                                                       OutDesc,
-                                                   CPerBlock,
+                                                       NPerBlock,
-                                                   HoPerBlock,
+                                                       KPerBlock,
-                                                   WoPerBlock,
+                                                       CPerBlock,
-                                                   KPerThread,
+                                                       HoPerBlock,
-                                                   CPerThread,
+                                                       WoPerBlock,
-                                                   HoPerThread,
+                                                       KPerThread,
-                                                   WoPerThread>
+                                                       CPerThread,
-        <<<grid_dim, block_dim>>>(InDesc{},
+                                                       HoPerThread,
-                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
+                                                       WoPerThread>
-                                  WeiDesc{},
+            <<<grid_dim, block_dim>>>(InDesc{},
-                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                                      static_cast<T*>(in_device_buf.GetDeviceBuffer()),
-                                  OutDesc{},
+                                      WeiDesc{},
-                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
+                                      static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                                      OutDesc{},
-    cudaEventCreate(&stop);
+                                      static_cast<T*>(out_device_buf.GetDeviceBuffer()));
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
+        cudaEventCreate(&stop);
+        cudaEventRecord(stop, 0);
-    cudaEventElapsedTime(&elapsedTime, start, stop);
+        cudaEventSynchronize(stop);
-    printf("Elapsed time : %f ms\n", elapsedTime);
+        cudaEventElapsedTime(&elapsedTime, start, stop);
+        printf("Elapsed time : %f ms\n", elapsedTime);
+        usleep(10000);
+    }
    checkCudaErrors(cudaGetLastError());
    out_device_buf.FromDevice(out.mData.data());

--- a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
+++ b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
@@ -65,20 +65,8 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
    constexpr unsigned WoPerThread = 2;
    constexpr unsigned BlockSize = 16;
-#elif 0
-    constexpr unsigned NPerBlock  = 1;
-    constexpr unsigned KPerBlock  = 64;
-    constexpr unsigned CPerBlock  = 2;
-    constexpr unsigned HoPerBlock = 4;
-    constexpr unsigned WoPerBlock = 32;
-    constexpr unsigned KPerThread  = 8;
-    constexpr unsigned CPerThread  = 1;
-    constexpr unsigned HoPerThread = 2;
-    constexpr unsigned WoPerThread = 4;
-    constexpr unsigned BlockSize = 128;
 #elif 1
+    // for 3x3, 34x34
    constexpr unsigned NPerBlock  = 1;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 2;
@@ -92,6 +80,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
    constexpr unsigned BlockSize = 128;
 #elif 0
+    // for 3x3, 34x34
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 2;

--- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
@@ -104,7 +104,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
-    const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock;
+    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
    const unsigned hi_block_data_begin = ho_block_data_begin;
    const unsigned wi_block_data_begin = wo_block_data_begin;

--- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
@@ -70,7 +70,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
-    const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock;
+    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
    const unsigned hi_block_data_begin = ho_block_data_begin;
    const unsigned wi_block_data_begin = wo_block_data_begin;
@@ -162,7 +162,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
        c_block_data_begin += CPerBlock, __syncthreads())
    {
-#if 0
+#if 1
        // input: global mem to LDS,
        //   convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
@@ -177,7 +177,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
            reorder_chwn_from_nchw);
 #endif
-#if 0
+#if 1
        // weight: global mem to LDS,
        //   format is [S,R,C,K], no conversion needed
        blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex(