"tests/benchmarks/bm_perspective_n_points.py" did not exist on "d565032399bde29fcbbad99a75987dbae923bca8"
blockwise_tensor_slice_op.hip.hpp 13 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "threadwise_tensor_slice_op.hip.hpp"
Chao Liu's avatar
Chao Liu committed
3
4
5
6
7
8
9
10
11
12
13
14

template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcLengths,
          class SrcSubLengths,
          class SrcClusterLengths,
          class MapDst2Src,
          class MapThreadCluster2SrcCluster,
          index_t SrcDataPerRead,
          index_t DstDataPerWrite>
Chao Liu's avatar
Chao Liu committed
15
struct BlockwiseTensorSliceReorderCopy_v3
Chao Liu's avatar
Chao Liu committed
16
17
18
{
    static constexpr index_t nDim = SrcLengths::GetSize();

Chao Liu's avatar
Chao Liu committed
19
20
    index_t mThreadSrcOffset;
    index_t mThreadDstOffset;
Chao Liu's avatar
Chao Liu committed
21

Chao Liu's avatar
Chao Liu committed
22
23
24
    __device__
    BlockwiseTensorSliceReorderCopy_v3(Array<index_t, nDim> src_block_data_multi_id_begin,
                                       Array<index_t, nDim> dst_block_data_multi_id_begin)
Chao Liu's avatar
Chao Liu committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
    {
        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};

        constexpr auto src_lengths = SrcLengths{};

        constexpr auto map_dst2src = MapDst2Src{};

        constexpr auto src_sub_lengths = SrcSubLengths{};
        constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src);

        constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{};

        constexpr auto src_cluster_lengths = SrcClusterLengths{};
        constexpr auto thread_cluster_lengths =
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

42
        constexpr auto thread_cluster_desc =
Chao Liu's avatar
Chao Liu committed
43
            make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);
Chao Liu's avatar
Chao Liu committed
44
45
46
47
48

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");

        // sanity check: nDim
Chao Liu's avatar
Chao Liu committed
49
50
51
        static_assert(SrcDesc::GetNumOfDimension() == nDim &&
                          DstDesc::GetNumOfDimension() == nDim && SrcLengths::GetSize() == nDim &&
                          SrcSubLengths::GetSize() == nDim &&
Chao Liu's avatar
Chao Liu committed
52
53
54
55
56
57
58
59
60
61
62
                          SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
                          MapThreadCluster2SrcCluster::GetSize() == nDim,
                      "wrong! nDim is not consistent\n");

        // sanity check: BlockSize
        constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize();

        static_assert(BlockSize >= num_active_thread,
                      "wrong! BlockSize is not big enough for ThreadPerDims!");

        // sanity check: work division
Chao Liu's avatar
Chao Liu committed
63
        static_for<0, nDim, 1>{}([&](auto IDim) {
Chao Liu's avatar
Chao Liu committed
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
            constexpr auto I                  = decltype(IDim){};
            constexpr index_t src_len         = src_lengths.Get(I);
            constexpr index_t src_sub_len     = src_sub_lengths.Get(I);
            constexpr index_t src_cluster_len = src_cluster_lengths.Get(I);
            static_assert(src_len % (src_sub_len * src_cluster_len) == 0,
                          "wrong! cannot evenly divide Src tensor lengths");
        });

        // sanity check: src read
        static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4,
                      "wrong! only support SrcDataPerRead == 1, 2 or 4!\n");

        static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n");

        static_assert(src_sub_lengths.Get(Number<nDim - 1>{}) % SrcDataPerRead == 0,
                      "wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n");

        static_assert(src_desc.GetStride(Number<nDim - 2>{}) % SrcDataPerRead == 0,
                      "wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to "
                      "keep alignment");

        // sanity check: dst write
        static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
                      "wrong! only support DstDataPerWrite == 1, 2 or 4!\n");

        static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");

        static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
                      "wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");

        static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
                      "wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
                      "keep alignment");

        // start dividing work
        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

109
110
        const auto thread_multi_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
Chao Liu's avatar
Chao Liu committed
111
112
113
114
115
116
117
118
119

        // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
        // regsiters, or only one copy???
        auto src_data_multi_id =
            reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);

        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto I    = decltype(IDim){};
            constexpr index_t i = I.Get();
120
121
            // compiler: will it really compute index here, or be merged with
            // GetOffsetFromMultiIndex and
Chao Liu's avatar
Chao Liu committed
122
123
124
125
            // optimized away???
            src_data_multi_id[i] *= src_sub_lengths.Get(I);
        });

126
127
        // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
        // and
Chao Liu's avatar
Chao Liu committed
128
129
130
        // optimized away???
        const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);

Chao Liu's avatar
Chao Liu committed
131
        mThreadSrcOffset =
132
            src_desc.GetOffsetFromMultiIndex(src_data_multi_id + src_block_data_multi_id_begin);
Chao Liu's avatar
Chao Liu committed
133

Chao Liu's avatar
Chao Liu committed
134
        mThreadDstOffset =
135
            dst_desc.GetOffsetFromMultiIndex(dst_data_multi_id + dst_block_data_multi_id_begin);
Chao Liu's avatar
Chao Liu committed
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#if 0
        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
        {
            print_ConstantTensorDescriptor(thread_cluster_desc, "thread_cluster_desc: ");
        }

        if(get_block_1d_id() == 0)
        {
            printf("id %5u %5u: "
                   "thread_multi_id: %u %u, "
                   "src_block_data_multi_id_begin: %u %u, "
                   "src_data_multi_id: %u %u, "
                   "mThreadSrcOffset %u, mThreadDstOffset %u \n",
                   get_block_1d_id(),
                   get_thread_local_1d_id(),
                   thread_multi_id[0],
                   thread_multi_id[1],
                   src_block_data_multi_id_begin[0],
                   src_block_data_multi_id_begin[1],
                   src_data_multi_id[0],
                   src_data_multi_id[1],
                   mThreadSrcOffset,
                   mThreadDstOffset);
        }
#endif
Chao Liu's avatar
Chao Liu committed
161
162
163
164
165
166
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

Chao Liu's avatar
Chao Liu committed
167
168
        constexpr auto src_data_per_cluster_per_dims =
            thread_sub_tensor_lengths * SrcClusterLengths{};
Chao Liu's avatar
Chao Liu committed
169
170
171
172
173
174

        constexpr auto repeat_lengths =
            transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
                                SrcLengths{},
                                src_data_per_cluster_per_dims);

Chao Liu's avatar
Chao Liu committed
175
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
Chao Liu's avatar
Chao Liu committed
176

177
        constexpr auto thread_tensor_desc =
Chao Liu's avatar
Chao Liu committed
178
            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
Chao Liu's avatar
Chao Liu committed
179
180
181
182
183
184
185
186
187

        return thread_tensor_desc.GetElementSpace();
    }

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

Chao Liu's avatar
Chao Liu committed
188
189
        constexpr auto src_data_per_cluster_per_dims =
            thread_sub_tensor_lengths * SrcClusterLengths{};
Chao Liu's avatar
Chao Liu committed
190
191
192
193
194
195

        constexpr auto repeat_lengths =
            transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
                                SrcLengths{},
                                src_data_per_cluster_per_dims);

Chao Liu's avatar
Chao Liu committed
196
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
Chao Liu's avatar
Chao Liu committed
197

198
        constexpr auto thread_tensor_desc =
Chao Liu's avatar
Chao Liu committed
199
            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
Chao Liu's avatar
Chao Liu committed
200
201
202
203

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

Chao Liu's avatar
Chao Liu committed
204
            constexpr auto src_data_multi_id = repeat_multi_id * src_data_per_cluster_per_dims;
Chao Liu's avatar
Chao Liu committed
205

Chao Liu's avatar
Chao Liu committed
206
            constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths;
Chao Liu's avatar
Chao Liu committed
207

208
            constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
Chao Liu's avatar
Chao Liu committed
209
            constexpr index_t clipboard_offset =
210
                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
Chao Liu's avatar
Chao Liu committed
211

Chao Liu's avatar
Chao Liu committed
212
            threadwise_tensor_slice_copy(SrcDesc{},
Chao Liu's avatar
Chao Liu committed
213
                                         p_src + src_offset + mThreadSrcOffset,
Chao Liu's avatar
Chao Liu committed
214
215
216
217
                                         thread_tensor_desc,
                                         p_clipboard + clipboard_offset,
                                         thread_sub_tensor_lengths,
                                         Number<SrcDataPerRead>{});
Chao Liu's avatar
Chao Liu committed
218
219
220
221
222
223
224
225
        });
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

Chao Liu's avatar
Chao Liu committed
226
227
        constexpr auto src_data_per_cluster_per_dims =
            thread_sub_tensor_lengths * SrcClusterLengths{};
Chao Liu's avatar
Chao Liu committed
228
229
230
231
232
233

        constexpr auto repeat_lengths =
            transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
                                SrcLengths{},
                                src_data_per_cluster_per_dims);

Chao Liu's avatar
Chao Liu committed
234
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
Chao Liu's avatar
Chao Liu committed
235

236
        constexpr auto thread_tensor_desc =
Chao Liu's avatar
Chao Liu committed
237
            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
Chao Liu's avatar
Chao Liu committed
238
239
240
241

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

Chao Liu's avatar
Chao Liu committed
242
            constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths;
Chao Liu's avatar
Chao Liu committed
243

Chao Liu's avatar
Chao Liu committed
244
            constexpr auto src_data_multi_id = repeat_multi_id * src_data_per_cluster_per_dims;
Chao Liu's avatar
Chao Liu committed
245
246
247
248
249

            // reorder src_data_multi_id to get dst_data_multi_id
            constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{});

            constexpr index_t clipboard_offset =
250
                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
Chao Liu's avatar
Chao Liu committed
251

252
            constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id);
Chao Liu's avatar
Chao Liu committed
253

Chao Liu's avatar
Chao Liu committed
254
// write in the order of dst
Chao Liu's avatar
Chao Liu committed
255
#if 1
Chao Liu's avatar
Chao Liu committed
256
257
258
259
            threadwise_tensor_slice_copy_reorder_given_dst2src_v2(thread_tensor_desc,
                                                                  p_clipboard + clipboard_offset,
                                                                  DstDesc{},
                                                                  p_dst + dst_offset +
Chao Liu's avatar
Chao Liu committed
260
                                                                      mThreadDstOffset,
Chao Liu's avatar
Chao Liu committed
261
262
                                                                  thread_sub_tensor_lengths,
                                                                  MapDst2Src{});
Chao Liu's avatar
Chao Liu committed
263
#else
Chao Liu's avatar
Chao Liu committed
264
265
266
267
            threadwise_tensor_slice_copy_reorder_given_dst2src_v3(thread_tensor_desc,
                                                                  p_clipboard + clipboard_offset,
                                                                  DstDesc{},
                                                                  p_dst + dst_offset +
Chao Liu's avatar
Chao Liu committed
268
                                                                      mThreadDstOffset,
Chao Liu's avatar
Chao Liu committed
269
270
271
                                                                  thread_sub_tensor_lengths,
                                                                  MapDst2Src{},
                                                                  Number<DstDataPerWrite>{});
Chao Liu's avatar
Chao Liu committed
272
273
274
275
276
277
278
279
280
281
282
#endif
        });
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        Float p_clipboard[GetRegisterClipboardSize()];

        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
Chao Liu's avatar
Chao Liu committed
283
284
285
286
287
288
289
290
291
292
293
294
295

    // this function doesn't do santiy check on whether the slicing window is out of the boundary
    // of the tensor being sliced
    template <index_t IDim_, index_t StepSize, bool PositiveDirection>
    __device__ void MoveSlicingWindowOnSourceTensor(
        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
    {
        constexpr auto IDim = Number<IDim_>{};

        static_if<PositiveDirection>{}([&](auto fwd) {
            mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
        }).Else([&](auto fwd) { mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); });
    }
Chao Liu's avatar
Chao Liu committed
296
};