"tests/vscode:/vscode.git/clone" did not exist on "51625eda4666c43827747082cf7e8929f6bdc658"
amd_buffer_addressing.hpp 37.1 KB
Newer Older
Chao Liu's avatar
tidy  
Chao Liu committed
1
2
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP
Chao Liu's avatar
Chao Liu committed
3

4
#include "data_type.hpp"
Chao Liu's avatar
Chao Liu committed
5
6
7
8

namespace ck {

template <typename T>
Chao Liu's avatar
tidy  
Chao Liu committed
9
union BufferResource
Chao Liu's avatar
Chao Liu committed
10
11
12
{
    // 128 bit SGPRs to supply buffer resource in buffer instructions
    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
13
    int32x4_t content;
14
15
16
    StaticallyIndexedArray<T*, 2> address;
    StaticallyIndexedArray<int32_t, 4> range;
    StaticallyIndexedArray<int32_t, 4> config;
Chao Liu's avatar
Chao Liu committed
17
18
19
};

template <typename T>
20
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
Chao Liu's avatar
Chao Liu committed
21
{
Chao Liu's avatar
tidy  
Chao Liu committed
22
    BufferResource<T> wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
23
24

    // wavewise base address (64 bit)
25
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
Chao Liu's avatar
Chao Liu committed
26
    // wavewise range (32 bit)
27
    wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
Chao Liu's avatar
Chao Liu committed
28
    // wavewise setting (32 bit)
29
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
30

31
    return wave_buffer_resource.content;
Chao Liu's avatar
Chao Liu committed
32
33
34
35
}

// load
__device__ int8_t
36
37
38
39
llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
                               index_t voffset,
                               index_t soffset,
                               index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
40
41

__device__ int8x2_t
42
43
44
45
llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
46
47

__device__ int8x4_t
48
49
50
51
llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
52

Chao Liu's avatar
Chao Liu committed
53
__device__ int16_t
54
55
56
57
llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
58
__device__ int32_t
59
60
61
62
llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
63
64

__device__ int32x2_t
65
66
67
68
llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
Chao Liu's avatar
Chao Liu committed
69
70

__device__ int32x4_t
71
72
73
74
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
75
76
// half
__device__ half_t
77
78
79
80
llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
81
82

__device__ half2_t
83
84
85
86
llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
87
88

__device__ half4_t
89
90
91
92
llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
Chao Liu's avatar
Chao Liu committed
93

94
// float
Chao Liu's avatar
Chao Liu committed
95
__device__ float
96
97
98
99
llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
Chao Liu's avatar
Chao Liu committed
100
101

__device__ float2_t
102
103
104
105
llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
Chao Liu's avatar
Chao Liu committed
106
107

__device__ float4_t
108
109
110
111
llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
Chao Liu's avatar
Chao Liu committed
112
113
114

// store
__device__ void
115
116
117
118
119
120
121
122
llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
                                int32x4_t rsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");

__device__ void
llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
Chao Liu's avatar
Chao Liu committed
123
124
125
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
126
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
Chao Liu's avatar
Chao Liu committed
127

128
__device__ void
129
130
131
132
133
llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
134
135

__device__ void
136
137
138
139
140
141
142
143
144
145
146
147
llvm_amdgcn_raw_buffer_store_i16(int16_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");

__device__ void
llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
148

Chao Liu's avatar
Chao Liu committed
149
__device__ void
150
llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata,
Chao Liu's avatar
Chao Liu committed
151
152
153
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
154
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
Chao Liu's avatar
Chao Liu committed
155
156

__device__ void
157
llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
Chao Liu's avatar
Chao Liu committed
158
159
160
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
161
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
Chao Liu's avatar
Chao Liu committed
162

163
// half
Chao Liu's avatar
Chao Liu committed
164
__device__ void
165
166
167
168
169
llvm_amdgcn_raw_buffer_store_fp16(half_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
Chao Liu's avatar
Chao Liu committed
170
171

__device__ void
172
173
174
175
176
llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
Chao Liu's avatar
Chao Liu committed
177

178
__device__ void
179
llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
180
181
182
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
183
184
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
// float
185
__device__ void
186
187
188
189
190
llvm_amdgcn_raw_buffer_store_fp32(float vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
191
192

__device__ void
193
llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata,
Chao Liu's avatar
Chao Liu committed
194
195
196
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
197
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
Chao Liu's avatar
Chao Liu committed
198
199

__device__ void
200
201
202
203
204
llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
zjing14's avatar
zjing14 committed
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
// atomic add
// int
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
    int32_t vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");

// float
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(
    float vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");
Chao Liu's avatar
Chao Liu committed
221
222

template <typename T, index_t N>
223
224
225
__device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                                 index_t src_thread_addr_offset,
                                                                 index_t src_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
226
{
227
228
229
230
231
232
    static_assert(
        (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
233
234
235
236
237

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
238
            return llvm_amdgcn_raw_buffer_load_fp32(
Chao Liu's avatar
Chao Liu committed
239
240
241
242
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
243
            return llvm_amdgcn_raw_buffer_load_fp32x2(
Chao Liu's avatar
Chao Liu committed
244
245
246
247
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
248
            return llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
249
250
251
252
253
254
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<float, 8> tmp;

255
            tmp.AsType<float4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
256
257
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

258
            tmp.AsType<float4_t>()(Number<1>{}) =
259
260
261
262
                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset + 4 * sizeof(float),
                                                   0);
263

264
            return tmp.AsType<float8_t>()(Number<0>{});
265
266
267
268
269
270
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
271
            return llvm_amdgcn_raw_buffer_load_fp16(
272
273
274
275
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
276
            return llvm_amdgcn_raw_buffer_load_fp16x2(
277
278
279
280
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
281
            return llvm_amdgcn_raw_buffer_load_fp16x4(
282
283
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
284
        else if constexpr(N == 8)
285
        {
zjing14's avatar
zjing14 committed
286
#if 0
287
288
            vector_type<half_t, 8> tmp;

289
            tmp.AsType<half4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp16x4(
290
291
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

292
            tmp.AsType<half4_t>()(Number<1>{}) =
293
                llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
294
295
296
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset + 4 * sizeof(half_t),
                                                     0);
Chao Liu's avatar
Chao Liu committed
297

298
            return tmp.AsType<half8_t>()(Number<0>{});
zjing14's avatar
zjing14 committed
299
#else
300
            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
zjing14's avatar
zjing14 committed
301
302
303
304
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<half8_t>(tmp);
#endif
Chao Liu's avatar
Chao Liu committed
305
306
307
308
309
310
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
311
            return llvm_amdgcn_raw_buffer_load_i32(
Chao Liu's avatar
Chao Liu committed
312
313
314
315
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
316
            return llvm_amdgcn_raw_buffer_load_i32x2(
Chao Liu's avatar
Chao Liu committed
317
318
319
320
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
321
            return llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
322
323
324
325
326
327
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<int32_t, 8> tmp;

328
            tmp.AsType<int32x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
329
330
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

331
            tmp.AsType<int32x4_t>()(Number<1>{}) =
332
333
334
335
                llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  0);
336
            return tmp.AsType<int32x8_t>()(Number<0>{});
Chao Liu's avatar
Chao Liu committed
337
338
        }
    }
339
    else if constexpr(is_same<T, int8_t>::value)
340
341
342
    {
        if constexpr(N == 1)
        {
343
            return llvm_amdgcn_raw_buffer_load_i8(
344
345
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
346
        else if constexpr(N == 2)
347
        {
348
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
349
            return llvm_amdgcn_raw_buffer_load_i8x2(
350
351
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
352
            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(
353
354
355
356
357
358
359
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x2_t>(tmp);
#endif
        }
        else if constexpr(N == 4)
        {
360
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
361
            return llvm_amdgcn_raw_buffer_load_i8x4(
362
363
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
364
            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(
365
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
366
367
368
369
370
371

            return as_type<int8x4_t>(tmp);
#endif
        }
        else if constexpr(N == 8)
        {
372
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
373
374
            vector_type<int8_t, 8> tmp;

375
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
376
377
378
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
379
380
381
382
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
383
384
385

            return tmp.AsType<int8x8_t>()(Number<0>{});
#else
386
            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(
387
388
389
390
391
392
393
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x8_t>(tmp);
#endif
        }
        else if constexpr(N == 16)
        {
394
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
395
396
            vector_type<int8_t, 16> tmp;

397
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
398
399
400
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
401
402
403
404
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
405
406

            tmp.AsType<int8x4_t>()(Number<2>{}) =
407
408
409
410
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 8 * sizeof(int8_t),
                                                 0);
411
412

            tmp.AsType<int8x4_t>()(Number<3>{}) =
413
414
415
416
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 12 * sizeof(int8_t),
                                                 0);
417
418
419

            return tmp.AsType<int8x16_t>()(Number<0>{});
#else
420
            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
421
422
423
424
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x16_t>(tmp);
#endif
425
426
        }
    }
Chao Liu's avatar
Chao Liu committed
427
428
429
}

template <typename T, index_t N>
430
431
432
433
__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
                                      int32x4_t dst_wave_buffer_resource,
                                      index_t dst_thread_addr_offset,
                                      index_t dst_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
434
{
435
436
437
438
439
440
    static_assert(
        (is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
441
442
443
444
445

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
446
447
448
449
450
451
452
453
454
            llvm_amdgcn_raw_buffer_store_fp32(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
455
456
457
458
459
460
461
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
462
463
464
465
466
            llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
Chao Liu's avatar
Chao Liu committed
467
468
469
470
471
472
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
473
474
475
476
477
478
479
480
481
            llvm_amdgcn_raw_buffer_store_i32(src_thread_data,
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
482
483
484
485
486
487
488
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
        }
        else if constexpr(N == 4)
        {
489
490
491
492
493
            llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data,
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
Chao Liu's avatar
Chao Liu committed
494
495
496
497
498
499
        }
    }
    else if constexpr(is_same<T, int8_t>::value)
    {
        if constexpr(N == 1)
        {
500
501
502
503
504
            llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
                                            dst_wave_buffer_resource,
                                            dst_thread_addr_offset,
                                            dst_wave_addr_offset,
                                            0);
Chao Liu's avatar
Chao Liu committed
505
506
507
        }
        else if constexpr(N == 2)
        {
508
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
509
510
511
512
513
            llvm_amdgcn_raw_buffer_store_i8x2(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
514
#else
515
516
517
518
519
            llvm_amdgcn_raw_buffer_store_i16(as_type<int16_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
520
#endif
Chao Liu's avatar
Chao Liu committed
521
522
523
        }
        else if constexpr(N == 4)
        {
524
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
525
526
527
528
529
            llvm_amdgcn_raw_buffer_store_i8x4(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
530
#else
531
532
533
534
535
            llvm_amdgcn_raw_buffer_store_i32(as_type<int32_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
536
#endif
Chao Liu's avatar
Chao Liu committed
537
        }
538
539
        else if constexpr(N == 8)
        {
540
541
542
543
544
            llvm_amdgcn_raw_buffer_store_i32x2(as_type<int32x2_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
545
546
547
        }
        else if constexpr(N == 16)
        {
548
549
550
551
552
            llvm_amdgcn_raw_buffer_store_i32x4(as_type<int32x4_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
553
554
555
556
557
558
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
559
560
561
562
563
564
565
566
567
            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
568
569
570
571
572
573
574
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
575
576
577
578
579
            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
580
581
582
583
584
        }
        else if constexpr(N == 8)
        {
            vector_type<half_t, 8> tmp{src_thread_data};

585
586
587
588
589
            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
590

591
592
593
594
595
            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(half_t),
                                                0);
596
        }
Chao Liu's avatar
Chao Liu committed
597
598
599
    }
}

zjing14's avatar
zjing14 committed
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
template <typename T, index_t N>
__device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
                                           int32x4_t dst_wave_buffer_resource,
                                           index_t dst_thread_addr_offset,
                                           index_t dst_wave_addr_offset)
{
    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data,
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
        }
        else if constexpr(N == 2)
        {
            vector_type<float, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);
        }
        else if constexpr(N == 4)
        {
            vector_type<float, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<2>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 2 * sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<3>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 3 * sizeof(float),
                                                   0);
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data,
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);
        }
        else if constexpr(N == 2)
        {
            vector_type<int32_t, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);
        }
        else if constexpr(N == 4)
        {
            vector_type<int32_t, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<2>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 2 * sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<3>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 3 * sizeof(int32_t),
                                                  0);
        }
    }
}

Chao Liu's avatar
Chao Liu committed
722
723
// buffer_load requires:
//   1) p_src_wave must be in global memory space
724
//   2) p_src_wave must be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
725
726
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
727
__device__ typename vector_type_maker<T, N>::type::type
728
729
730
731
amd_buffer_load_invalid_element_return_return_zero(const T* p_src_wave,
                                                   index_t src_thread_element_offset,
                                                   bool src_thread_element_valid,
                                                   index_t src_element_space_size)
Chao Liu's avatar
Chao Liu committed
732
733
{
    const int32x4_t src_wave_buffer_resource =
734
        make_wave_buffer_resource(p_src_wave, src_element_space_size);
Chao Liu's avatar
Chao Liu committed
735

736
737
738
739
    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
Chao Liu's avatar
Chao Liu committed
740

741
742
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
743
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
744
    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
745

746
    return amd_buffer_load_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
747
748
        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
#else
749
    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
750
        src_wave_buffer_resource, src_thread_addr_offset, 0);
Chao Liu's avatar
Chao Liu committed
751

752
    return src_thread_element_valid ? tmp : vector_t(0);
Chao Liu's avatar
Chao Liu committed
753
754
755
#endif
}

756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
// buffer_load requires:
//   1) p_src_wave must be in global memory space
//   2) p_src_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                        index_t src_thread_element_offset,
                                                        bool src_thread_element_valid,
                                                        index_t src_element_space_size,
                                                        T customized_value)
{
    const int32x4_t src_wave_buffer_resource =
        make_wave_buffer_resource(p_src_wave, src_element_space_size);

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
        src_wave_buffer_resource, src_thread_addr_offset, 0);

    return src_thread_element_valid ? tmp : vector_t(customized_value);
}

Chao Liu's avatar
Chao Liu committed
784
785
// buffer_store requires:
//   1) p_dst_wave must be global memory
zjing14's avatar
zjing14 committed
786
//   2) p_dst_wave must be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
787
788
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
789
790
791
792
793
__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                 T* p_dst_wave,
                                 const index_t dst_thread_element_offset,
                                 const bool dst_thread_element_valid,
                                 const index_t dst_element_space_size)
Chao Liu's avatar
Chao Liu committed
794
795
{
    const int32x4_t dst_wave_buffer_resource =
796
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
Chao Liu's avatar
Chao Liu committed
797

798
    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
Chao Liu's avatar
Chao Liu committed
799

800
801
802
803
    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
804
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
805
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
806

807
    amd_buffer_store_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
808
809
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
810
    if(dst_thread_element_valid)
Chao Liu's avatar
Chao Liu committed
811
    {
812
        amd_buffer_store_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
813
814
815
816
817
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

zjing14's avatar
zjing14 committed
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
// buffer_atomic_add requires:
//   1) p_dst_wave must be global memory
//   2) p_dst_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff;

    amd_buffer_atomic_add_impl<scalar_t, vector_size>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_atomic_add_impl<scalar_t, vector_size>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

Chao Liu's avatar
Chao Liu committed
853
854
} // namespace ck
#endif