device_implicit_gemm_convolution_nchw_kcsr.cuh 4.15 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "gridwise_implicit_gemm_convolution_nchw_kcsr.cuh"
Chao Liu's avatar
Chao Liu committed
3
4

template <class T, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
5
void device_implicit_gemm_convolution_nchw_kcsr(
Chao Liu's avatar
Chao Liu committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
{
    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());

    int num_thread = std::thread::hardware_concurrency();

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    out_device_buf.ToDevice(out.mData.data());

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

Chao Liu's avatar
Chao Liu committed
28
#if 0
Chao Liu's avatar
Chao Liu committed
29
30
31
    constexpr unsigned NPerBlock  = 1;
    constexpr unsigned KPerBlock  = 1;
    constexpr unsigned CPerBlock  = 1;
Chao Liu's avatar
Chao Liu committed
32
33
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;
Chao Liu's avatar
Chao Liu committed
34

Chao Liu's avatar
Chao Liu committed
35
36
37
38
    constexpr unsigned KPerThread  = 1;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;
Chao Liu's avatar
Chao Liu committed
39

Chao Liu's avatar
Chao Liu committed
40
    constexpr unsigned BlockSize = 16;
Chao Liu's avatar
Chao Liu committed
41
42
43
44
45
#elif 1
    constexpr unsigned NPerBlock  = 1;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 4;
Chao Liu's avatar
Chao Liu committed
46
47
    constexpr unsigned WoPerBlock = 32;

Chao Liu's avatar
Chao Liu committed
48
49
    constexpr unsigned KPerThread  = 16;
    constexpr unsigned CPerThread  = 1;
Chao Liu's avatar
Chao Liu committed
50
    constexpr unsigned HoPerThread = 2;
Chao Liu's avatar
Chao Liu committed
51
52
53
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned BlockSize = 128;
Chao Liu's avatar
Chao Liu committed
54
#elif 0
Chao Liu's avatar
Chao Liu committed
55
    constexpr unsigned NPerBlock  = 1;
Chao Liu's avatar
Chao Liu committed
56
57
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 4;
Chao Liu's avatar
Chao Liu committed
58
    constexpr unsigned HoPerBlock = 4;
Chao Liu's avatar
Chao Liu committed
59
60
    constexpr unsigned WoPerBlock = 32;

Chao Liu's avatar
Chao Liu committed
61
    constexpr unsigned KPerThread  = 8;
Chao Liu's avatar
Chao Liu committed
62
    constexpr unsigned CPerThread  = 2;
Chao Liu's avatar
Chao Liu committed
63
    constexpr unsigned HoPerThread = 2;
Chao Liu's avatar
Chao Liu committed
64
    constexpr unsigned WoPerThread = 4;
Chao Liu's avatar
Chao Liu committed
65

Chao Liu's avatar
Chao Liu committed
66
    constexpr unsigned BlockSize = 128;
Chao Liu's avatar
Chao Liu committed
67
68
#endif

Chao Liu's avatar
Chao Liu committed
69
70
71
    constexpr unsigned GridSize =
        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);
Chao Liu's avatar
Chao Liu committed
72
73
74
75
76
77
78
79
80
81
82
83

    dim3 block_dim(BlockSize);
    dim3 grid_dim(GridSize);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

Chao Liu's avatar
Chao Liu committed
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
    gridwise_implicit_gemm_convolution_nchw_kcsr<GridSize,
                                                 BlockSize,
                                                 T,
                                                 InDesc,
                                                 WeiDesc,
                                                 OutDesc,
                                                 NPerBlock,
                                                 KPerBlock,
                                                 CPerBlock,
                                                 HoPerBlock,
                                                 WoPerBlock,
                                                 KPerThread,
                                                 CPerThread,
                                                 HoPerThread,
                                                 WoPerThread>
        <<<grid_dim, block_dim>>>(InDesc{},
                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                  WeiDesc{},
                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
                                  OutDesc{},
                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
Chao Liu's avatar
Chao Liu committed
105
106
107
108
109
110
111
112
113
114
115

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Elapsed time : %f ms\n", elapsedTime);

    checkCudaErrors(cudaGetLastError());
    out_device_buf.FromDevice(out.mData.data());
}