// device_direct_convolution_2.hpp
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
7
8
9
10
11
12
13
void device_direct_convolution_2(InDesc,
                                 const Tensor<T>& in,
                                 WeiDesc,
                                 const Tensor<T>& wei,
                                 OutDesc,
                                 Tensor<T>& out,
                                 unsigned nrepeat)
Chao Liu's avatar
Chao Liu committed
14
15
16
17
18
19
20
21
22
23
24
25
{
    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());

    int num_thread = std::thread::hardware_concurrency();

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    out_device_buf.ToDevice(out.mData.data());

Chao Liu's avatar
Chao Liu committed
26
27
28
29
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
30

Chao Liu's avatar
Chao Liu committed
31
32
33
34
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

Chao Liu's avatar
Chao Liu committed
35
#if 1
Chao Liu's avatar
Chao Liu committed
36
    // 3x3, 34x34, 128 thread
Chao Liu's avatar
Chao Liu committed
37
38
    constexpr unsigned OutTileSizeH = 2;
    constexpr unsigned OutTileSizeW = 2;
Chao Liu's avatar
rename  
Chao Liu committed
39
40
    constexpr unsigned NPerBlock    = 2;
    constexpr unsigned KPerBlock    = 32;
41
    constexpr unsigned CPerBlock    = 4;
Chao Liu's avatar
rename  
Chao Liu committed
42
    constexpr unsigned YPerBlock    = 1;
Chao Liu's avatar
Chao Liu committed
43
44
    constexpr unsigned XPerBlock    = 16;

Chao Liu's avatar
rename  
Chao Liu committed
45
46
47
48
    constexpr unsigned NPerThread = 2;
    constexpr unsigned KPerThread = 4;
    constexpr unsigned CPerThread = 2;

Chao Liu's avatar
Chao Liu committed
49
    constexpr unsigned BlockSize = 128;
Chao Liu's avatar
Chao Liu committed
50
#elif 0
Chao Liu's avatar
Chao Liu committed
51
    // 3x3, 34x34, 256 thread
Chao Liu's avatar
Chao Liu committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
    constexpr unsigned OutTileSizeH = 2;
    constexpr unsigned OutTileSizeW = 2;
    constexpr unsigned NPerBlock    = 2;
    constexpr unsigned KPerBlock    = 32;
    constexpr unsigned CPerBlock    = 4;
    constexpr unsigned YPerBlock    = 1;
    constexpr unsigned XPerBlock    = 32;

    constexpr unsigned NPerThread = 2;
    constexpr unsigned KPerThread = 4;
    constexpr unsigned CPerThread = 2;

    constexpr unsigned BlockSize = 256;
#endif
Chao Liu's avatar
Chao Liu committed
66
67
68
69
70
71
72
73

    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
                                  (out_desc.GetLength(I1) / KPerBlock) *
                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

Chao Liu's avatar
Chao Liu committed
74
75
    for(unsigned i = 0; i < nrepeat; ++i)
    {
Chao Liu's avatar
Chao Liu committed
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
        float time = launch_kernel(gridwise_direct_convolution_2<T,
                                                                 InDesc,
                                                                 WeiDesc,
                                                                 OutDesc,
                                                                 OutTileSizeH,
                                                                 OutTileSizeW,
                                                                 NPerBlock,
                                                                 KPerBlock,
                                                                 CPerBlock,
                                                                 YPerBlock,
                                                                 XPerBlock,
                                                                 NPerThread,
                                                                 KPerThread,
                                                                 CPerThread,
                                                                 BlockSize,
                                                                 GridSize>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_device_buf.GetDeviceBuffer()));
Chao Liu's avatar
Chao Liu committed
97
98
99

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
Chao Liu's avatar
Chao Liu committed
100
    }
Chao Liu's avatar
Chao Liu committed
101
102
103

    out_device_buf.FromDevice(out.mData.data());
}