// device_direct_convolution_1.hpp
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include <unistd.h>
Chao Liu's avatar
Chao Liu committed
3
#include "device.hpp"
4
#include "gridwise_direct_convolution_1.hip.hpp"
Chao Liu's avatar
Chao Liu committed
5
6

template <class T, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
7
8
9
10
11
12
void device_direct_convolution_1(InDesc,
                                 const Tensor<T>& in,
                                 WeiDesc,
                                 const Tensor<T>& wei,
                                 OutDesc,
                                 Tensor<T>& out,
Chao Liu's avatar
Chao Liu committed
13
                                 index_t nrepeat)
Chao Liu's avatar
Chao Liu committed
14
15
16
17
18
19
20
21
22
23
24
25
{
    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());

    int num_thread = std::thread::hardware_concurrency();

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    out_device_buf.ToDevice(out.mData.data());

Chao Liu's avatar
Chao Liu committed
26
27
28
29
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
30

Chao Liu's avatar
Chao Liu committed
31
32
33
34
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

Chao Liu's avatar
Chao Liu committed
35
#if 1
Chao Liu's avatar
Chao Liu committed
36
    // 3x3, 34x34
Chao Liu's avatar
Chao Liu committed
37
38
39
40
41
    constexpr index_t NPerBlock  = 2;
    constexpr index_t KPerBlock  = 16;
    constexpr index_t CPerBlock  = 2;
    constexpr index_t HoPerBlock = 4;
    constexpr index_t WoPerBlock = 32;
Chao Liu's avatar
Chao Liu committed
42

Chao Liu's avatar
Chao Liu committed
43
44
45
46
47
    constexpr index_t NPerThread  = 2;
    constexpr index_t KPerThread  = 4;
    constexpr index_t CPerThread  = 2;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
Chao Liu's avatar
Chao Liu committed
48

Chao Liu's avatar
Chao Liu committed
49
    constexpr index_t BlockSize = 128;
Chao Liu's avatar
Chao Liu committed
50
#endif
Chao Liu's avatar
Chao Liu committed
51

Chao Liu's avatar
Chao Liu committed
52
    constexpr index_t GridSize =
Chao Liu's avatar
Chao Liu committed
53
54
        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);
Chao Liu's avatar
Chao Liu committed
55
56
57

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

Chao Liu's avatar
Chao Liu committed
58
    for(index_t i = 0; i < nrepeat; ++i)
Chao Liu's avatar
Chao Liu committed
59
    {
Chao Liu's avatar
Chao Liu committed
60
61
62
63
64
65
66
        float time = launch_kernel(gridwise_direct_convolution_1<T,
                                                                 InDesc,
                                                                 WeiDesc,
                                                                 OutDesc,
                                                                 NPerBlock,
                                                                 KPerBlock,
                                                                 CPerBlock,
Chao Liu's avatar
Chao Liu committed
67
68
                                                                 HoPerBlock,
                                                                 WoPerBlock,
Chao Liu's avatar
Chao Liu committed
69
70
71
                                                                 NPerThread,
                                                                 KPerThread,
                                                                 CPerThread,
Chao Liu's avatar
Chao Liu committed
72
73
                                                                 HoPerThread,
                                                                 WoPerThread,
Chao Liu's avatar
Chao Liu committed
74
75
76
77
78
79
80
                                                                 BlockSize,
                                                                 GridSize>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_device_buf.GetDeviceBuffer()));
Chao Liu's avatar
Chao Liu committed
81
82
83

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
Chao Liu's avatar
Chao Liu committed
84
    }
Chao Liu's avatar
Chao Liu committed
85
86
87

    out_device_buf.FromDevice(out.mData.data());
}