// device_direct_convolution_1.hpp
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_1.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
7
8
9
10
11
12
13
void device_direct_convolution_1(InDesc,
                                 const Tensor<T>& in,
                                 WeiDesc,
                                 const Tensor<T>& wei,
                                 OutDesc,
                                 Tensor<T>& out,
                                 unsigned nrepeat)
Chao Liu's avatar
Chao Liu committed
14
15
16
17
18
19
20
21
22
23
24
25
{
    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());

    int num_thread = std::thread::hardware_concurrency();

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    out_device_buf.ToDevice(out.mData.data());

Chao Liu's avatar
Chao Liu committed
26
27
28
29
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
30

Chao Liu's avatar
Chao Liu committed
31
32
33
34
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

Chao Liu's avatar
Chao Liu committed
35
#if 1
Chao Liu's avatar
Chao Liu committed
36
    // 3x3, 34x34
Chao Liu's avatar
Chao Liu committed
37
38
39
40
41
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 16;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 32;
Chao Liu's avatar
Chao Liu committed
42

Chao Liu's avatar
Chao Liu committed
43
44
45
46
47
    constexpr unsigned NPerThread  = 2;
    constexpr unsigned KPerThread  = 4;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;
Chao Liu's avatar
Chao Liu committed
48
49

    constexpr unsigned BlockSize = 128;
Chao Liu's avatar
Chao Liu committed
50
#endif
Chao Liu's avatar
Chao Liu committed
51

Chao Liu's avatar
Chao Liu committed
52
53
54
    constexpr unsigned GridSize =
        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);
Chao Liu's avatar
Chao Liu committed
55
56
57

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

Chao Liu's avatar
Chao Liu committed
58
59
    for(unsigned i = 0; i < nrepeat; ++i)
    {
Chao Liu's avatar
Chao Liu committed
60
61
62
63
64
65
66
        float time = launch_kernel(gridwise_direct_convolution_1<T,
                                                                 InDesc,
                                                                 WeiDesc,
                                                                 OutDesc,
                                                                 NPerBlock,
                                                                 KPerBlock,
                                                                 CPerBlock,
Chao Liu's avatar
Chao Liu committed
67
68
                                                                 HoPerBlock,
                                                                 WoPerBlock,
Chao Liu's avatar
Chao Liu committed
69
70
71
                                                                 NPerThread,
                                                                 KPerThread,
                                                                 CPerThread,
Chao Liu's avatar
Chao Liu committed
72
73
                                                                 HoPerThread,
                                                                 WoPerThread,
Chao Liu's avatar
Chao Liu committed
74
75
76
77
78
79
80
                                                                 BlockSize,
                                                                 GridSize>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_device_buf.GetDeviceBuffer()));
Chao Liu's avatar
Chao Liu committed
81
82
83

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
Chao Liu's avatar
Chao Liu committed
84
    }
Chao Liu's avatar
Chao Liu committed
85
86
87

    out_device_buf.FromDevice(out.mData.data());
}