"vscode:/vscode.git/clone" did not exist on "91a7fee03a3973a56cb5f687a6859ef0aaacf15e"
blockwise_direct_convolution.hip.hpp 5.98 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#pragma once
2
3
4
#include "ConstantTensorDescriptor.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.hip.hpp"
Chao Liu's avatar
Chao Liu committed
5

Chao Liu's avatar
Chao Liu committed
6
template <index_t BlockSize,
Chao Liu's avatar
Chao Liu committed
7
          class Float,
Chao Liu's avatar
Chao Liu committed
8
9
10
          class InBlockDesc,
          class WeiBlockDesc,
          class OutBlockDesc,
Chao Liu's avatar
Chao Liu committed
11
12
13
14
15
          index_t NPerThread,
          index_t KPerThread,
          index_t CPerThread,
          index_t HoPerThread,
          index_t WoPerThread>
Chao Liu's avatar
Chao Liu committed
16
__device__ void blockwise_direct_convolution(InBlockDesc,
Chao Liu's avatar
Chao Liu committed
17
                                             Float* const __restrict__ p_in_block,
Chao Liu's avatar
Chao Liu committed
18
                                             WeiBlockDesc,
Chao Liu's avatar
Chao Liu committed
19
                                             Float* const __restrict__ p_wei_block,
Chao Liu's avatar
Chao Liu committed
20
                                             OutBlockDesc,
Chao Liu's avatar
Chao Liu committed
21
                                             Float* __restrict__ p_out_block)
Chao Liu's avatar
Chao Liu committed
22
{
Chao Liu's avatar
Chao Liu committed
23
24
25
26
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
27
28
29
30
31

    constexpr auto in_block_desc  = InBlockDesc{};
    constexpr auto wei_block_desc = WeiBlockDesc{};
    constexpr auto out_block_desc = OutBlockDesc{};

Chao Liu's avatar
Chao Liu committed
32
33
    constexpr index_t Y = wei_block_desc.GetLength(I2);
    constexpr index_t X = wei_block_desc.GetLength(I3);
Chao Liu's avatar
Chao Liu committed
34

Chao Liu's avatar
Chao Liu committed
35
36
    constexpr index_t InTileSizeH = HoPerThread + Y - 1;
    constexpr index_t InTileSizeW = WoPerThread + X - 1;
Chao Liu's avatar
Chao Liu committed
37

Chao Liu's avatar
Chao Liu committed
38
    // divide thread work
Chao Liu's avatar
Chao Liu committed
39
40
41
42
    constexpr index_t NThreadWork = (out_block_desc.GetLength(I0) + NPerThread - 1) / NPerThread;
    constexpr index_t KThreadWork = (out_block_desc.GetLength(I1) + KPerThread - 1) / KPerThread;
    constexpr index_t YThreadWork = (out_block_desc.GetLength(I2) + HoPerThread - 1) / HoPerThread;
    constexpr index_t XThreadWork = (out_block_desc.GetLength(I3) + WoPerThread - 1) / WoPerThread;
Chao Liu's avatar
Chao Liu committed
43

Chao Liu's avatar
Chao Liu committed
44
#if 0
45
    if(get_thread_local_1d_id() == 0)
Chao Liu's avatar
Chao Liu committed
46
47
48
49
50
51
52
    {
        print_ConstantTensorDescriptor(in_block_desc);
        print_ConstantTensorDescriptor(wei_block_desc);
        print_ConstantTensorDescriptor(out_block_desc);
    }
#endif

Chao Liu's avatar
Chao Liu committed
53
54
    constexpr auto in_thread_desc =
        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
Chao Liu's avatar
Chao Liu committed
55

Chao Liu's avatar
Chao Liu committed
56
    constexpr auto wei_thread_desc =
Chao Liu's avatar
Chao Liu committed
57
        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, Y, X>{});
Chao Liu's avatar
Chao Liu committed
58

Chao Liu's avatar
Chao Liu committed
59
    constexpr auto out_thread_desc =
Chao Liu's avatar
Chao Liu committed
60
        get_convolution_output_default_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
Chao Liu's avatar
Chao Liu committed
61

Chao Liu's avatar
Chao Liu committed
62
63
    constexpr auto in_thread_block_desc =
        make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
64

Chao Liu's avatar
Chao Liu committed
65
66
    constexpr auto wei_thread_block_desc =
        make_ConstantTensorDescriptor(wei_thread_desc.GetLengths(), wei_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
67

Chao Liu's avatar
Chao Liu committed
68
69
    constexpr auto out_thread_block_desc =
        make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
70

71
    const index_t thread_id = get_thread_local_1d_id();
Chao Liu's avatar
Chao Liu committed
72

Chao Liu's avatar
Chao Liu committed
73
    for(index_t thread_work_id = thread_id;
Chao Liu's avatar
Chao Liu committed
74
        thread_work_id < NThreadWork * KThreadWork * YThreadWork * XThreadWork;
Chao Liu's avatar
Chao Liu committed
75
76
        thread_work_id += BlockSize)
    {
Chao Liu's avatar
Chao Liu committed
77
78
        index_t itmp             = thread_work_id;
        index_t n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
Chao Liu's avatar
Chao Liu committed
79
        itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
Chao Liu's avatar
Chao Liu committed
80
        index_t k_thread_work_id = itmp / (YThreadWork * XThreadWork);
Chao Liu's avatar
Chao Liu committed
81
        itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
Chao Liu's avatar
Chao Liu committed
82
83
        index_t y_thread_work_id = itmp / XThreadWork;
        index_t x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
Chao Liu's avatar
Chao Liu committed
84

Chao Liu's avatar
Chao Liu committed
85
86
87
88
        index_t n_thread_data_begin  = n_thread_work_id * NPerThread;
        index_t k_thread_data_begin  = k_thread_work_id * KPerThread;
        index_t ho_thread_data_begin = y_thread_work_id * HoPerThread;
        index_t wo_thread_data_begin = x_thread_work_id * WoPerThread;
Chao Liu's avatar
Chao Liu committed
89

Chao Liu's avatar
Chao Liu committed
90
91
        index_t hi_thread_data_begin = ho_thread_data_begin; // minus padding
        index_t wi_thread_data_begin = wo_thread_data_begin; // minus padding
Chao Liu's avatar
Chao Liu committed
92

Chao Liu's avatar
Chao Liu committed
93
        Float p_out_thread[out_thread_desc.GetElementSpace()];
Chao Liu's avatar
Chao Liu committed
94

Chao Liu's avatar
Chao Liu committed
95
        threadwise_4d_tensor_copy(out_block_desc,
96
97
98
99
                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                          k_thread_data_begin,
                                                                          ho_thread_data_begin,
                                                                          wo_thread_data_begin),
Chao Liu's avatar
Chao Liu committed
100
                                  out_thread_desc,
Chao Liu's avatar
Chao Liu committed
101
                                  p_out_thread,
Chao Liu's avatar
Chao Liu committed
102
                                  out_thread_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
103

Chao Liu's avatar
Chao Liu committed
104
        for(index_t c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
Chao Liu's avatar
Chao Liu committed
105
            c_thread_data_begin += CPerThread)
Chao Liu's avatar
Chao Liu committed
106
        {
Chao Liu's avatar
Chao Liu committed
107
108
109
            // threadwise convolution
            threadwise_direct_convolution_2(
                in_thread_block_desc,
110
111
112
113
                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data_begin,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
Chao Liu's avatar
Chao Liu committed
114
115
116
                wei_thread_block_desc,
                p_wei_block +
                    wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
Chao Liu's avatar
Chao Liu committed
117
118
                out_thread_desc,
                p_out_thread);
Chao Liu's avatar
Chao Liu committed
119
        }
Chao Liu's avatar
Chao Liu committed
120
121
122
123

        // copy output into LDS
        threadwise_4d_tensor_copy(out_thread_desc,
                                  p_out_thread,
Chao Liu's avatar
Chao Liu committed
124
                                  out_block_desc,
125
126
127
128
                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                          k_thread_data_begin,
                                                                          ho_thread_data_begin,
                                                                          wo_thread_data_begin),
Chao Liu's avatar
Chao Liu committed
129
                                  out_thread_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
130
131
    }
}