amd_buffer_addressing.hpp 38.9 KB
Newer Older
Chao Liu's avatar
tidy  
Chao Liu committed
1
2
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP
Chao Liu's avatar
Chao Liu committed
3

4
#include "data_type.hpp"
Chao Liu's avatar
Chao Liu committed
5
6
7
8

namespace ck {

template <typename T>
Chao Liu's avatar
tidy  
Chao Liu committed
9
union BufferResource
Chao Liu's avatar
Chao Liu committed
10
11
12
{
    // 128 bit SGPRs to supply buffer resource in buffer instructions
    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
13
    int32x4_t content;
14
15
16
    StaticallyIndexedArray<T*, 2> address;
    StaticallyIndexedArray<int32_t, 4> range;
    StaticallyIndexedArray<int32_t, 4> config;
Chao Liu's avatar
Chao Liu committed
17
18
19
};

template <typename T>
20
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
Chao Liu's avatar
Chao Liu committed
21
{
Chao Liu's avatar
tidy  
Chao Liu committed
22
    BufferResource<T> wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
23
24

    // wavewise base address (64 bit)
25
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
Chao Liu's avatar
Chao Liu committed
26
    // wavewise range (32 bit)
27
    wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
Chao Liu's avatar
Chao Liu committed
28
    // wavewise setting (32 bit)
29
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
30

31
    return wave_buffer_resource.content;
Chao Liu's avatar
Chao Liu committed
32
33
34
35
}

// load
__device__ int8_t
36
37
38
39
llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
                               index_t voffset,
                               index_t soffset,
                               index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
40
41

__device__ int8x2_t
42
43
44
45
llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
46
47

__device__ int8x4_t
48
49
50
51
llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
52

Chao Liu's avatar
Chao Liu committed
53
__device__ int16_t
54
55
56
57
llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
58
__device__ int32_t
59
60
61
62
llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
63
64

__device__ int32x2_t
65
66
67
68
llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
Chao Liu's avatar
Chao Liu committed
69
70

__device__ int32x4_t
71
72
73
74
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
75
76
// half
__device__ half_t
77
78
79
80
llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
81
82

__device__ half2_t
83
84
85
86
llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
87
88

__device__ half4_t
89
90
91
92
llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
Chao Liu's avatar
Chao Liu committed
93

94
// float
Chao Liu's avatar
Chao Liu committed
95
__device__ float
96
97
98
99
llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
Chao Liu's avatar
Chao Liu committed
100
101

__device__ float2_t
102
103
104
105
llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
Chao Liu's avatar
Chao Liu committed
106
107

__device__ float4_t
108
109
110
111
llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
Chao Liu's avatar
Chao Liu committed
112
113
114

// store
__device__ void
115
116
117
118
119
120
121
122
llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
                                int32x4_t rsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");

__device__ void
llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
Chao Liu's avatar
Chao Liu committed
123
124
125
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
126
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
Chao Liu's avatar
Chao Liu committed
127

128
__device__ void
129
130
131
132
133
llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
134
135

__device__ void
136
137
138
139
140
141
142
143
144
145
146
147
llvm_amdgcn_raw_buffer_store_i16(int16_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");

__device__ void
llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
148

Chao Liu's avatar
Chao Liu committed
149
__device__ void
150
llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata,
Chao Liu's avatar
Chao Liu committed
151
152
153
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
154
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
Chao Liu's avatar
Chao Liu committed
155
156

__device__ void
157
llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
Chao Liu's avatar
Chao Liu committed
158
159
160
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
161
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
Chao Liu's avatar
Chao Liu committed
162

163
// half
Chao Liu's avatar
Chao Liu committed
164
__device__ void
165
166
167
168
169
llvm_amdgcn_raw_buffer_store_fp16(half_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
Chao Liu's avatar
Chao Liu committed
170
171

__device__ void
172
173
174
175
176
llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
Chao Liu's avatar
Chao Liu committed
177

178
__device__ void
179
llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
180
181
182
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
183
184
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
// float
185
__device__ void
186
187
188
189
190
llvm_amdgcn_raw_buffer_store_fp32(float vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
191
192

__device__ void
193
llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata,
Chao Liu's avatar
Chao Liu committed
194
195
196
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
197
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
Chao Liu's avatar
Chao Liu committed
198
199

__device__ void
200
201
202
203
204
llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
zjing14's avatar
zjing14 committed
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
// atomic add
// int
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
    int32_t vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");

// float
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(
    float vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");
Chao Liu's avatar
Chao Liu committed
221
222

template <typename T, index_t N>
223
224
225
__device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                                 index_t src_thread_addr_offset,
                                                                 index_t src_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
226
{
227
    static_assert(
Chao Liu's avatar
Chao Liu committed
228
229
        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
230
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
Chao Liu's avatar
Chao Liu committed
231
232
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
233
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
234

Chao Liu's avatar
Chao Liu committed
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
    if constexpr(is_same<T, double>::value)
    {
        // use fp32 load to mimic fp64 load
        if constexpr(N == 1)
        {
            const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2(
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<double>(tmp);
        }
        else if constexpr(N == 2)
        {
            const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<double2_t>(tmp);
        }
        else if constexpr(N == 4)
        {
            const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4(
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            const float4_t f32_1 =
                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset + 4 * sizeof(float),
                                                   0);
            vector_type<double, 4> tmp;

            tmp.AsType<double2_t>()(Number<0>{}) = as_type<double2_t>(f32_0);
            tmp.AsType<double2_t>()(Number<1>{}) = as_type<double2_t>(f32_1);

            return tmp.AsType<double4_t>()(Number<0>{});
        }
    }
    else if constexpr(is_same<T, float>::value)
Chao Liu's avatar
Chao Liu committed
271
272
273
    {
        if constexpr(N == 1)
        {
274
            return llvm_amdgcn_raw_buffer_load_fp32(
Chao Liu's avatar
Chao Liu committed
275
276
277
278
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
279
            return llvm_amdgcn_raw_buffer_load_fp32x2(
Chao Liu's avatar
Chao Liu committed
280
281
282
283
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
284
            return llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
285
286
287
288
289
290
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<float, 8> tmp;

291
            tmp.AsType<float4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
292
293
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

294
            tmp.AsType<float4_t>()(Number<1>{}) =
295
296
297
298
                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset + 4 * sizeof(float),
                                                   0);
299

300
            return tmp.AsType<float8_t>()(Number<0>{});
301
302
303
304
305
306
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
307
            return llvm_amdgcn_raw_buffer_load_fp16(
308
309
310
311
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
312
            return llvm_amdgcn_raw_buffer_load_fp16x2(
313
314
315
316
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
317
            return llvm_amdgcn_raw_buffer_load_fp16x4(
318
319
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
320
        else if constexpr(N == 8)
321
        {
Chao Liu's avatar
Chao Liu committed
322
            // use fp32 load to mimic fp16 load
323
            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
zjing14's avatar
zjing14 committed
324
325
326
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<half8_t>(tmp);
Chao Liu's avatar
Chao Liu committed
327
328
329
330
331
332
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
333
            return llvm_amdgcn_raw_buffer_load_i32(
Chao Liu's avatar
Chao Liu committed
334
335
336
337
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
338
            return llvm_amdgcn_raw_buffer_load_i32x2(
Chao Liu's avatar
Chao Liu committed
339
340
341
342
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
343
            return llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
344
345
346
347
348
349
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<int32_t, 8> tmp;

350
            tmp.AsType<int32x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
351
352
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

353
            tmp.AsType<int32x4_t>()(Number<1>{}) =
354
355
356
357
                llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  0);
358
            return tmp.AsType<int32x8_t>()(Number<0>{});
Chao Liu's avatar
Chao Liu committed
359
360
        }
    }
361
    else if constexpr(is_same<T, int8_t>::value)
362
363
364
    {
        if constexpr(N == 1)
        {
365
            return llvm_amdgcn_raw_buffer_load_i8(
366
367
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
368
        else if constexpr(N == 2)
369
        {
370
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
371
            return llvm_amdgcn_raw_buffer_load_i8x2(
372
373
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
374
            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(
375
376
377
378
379
380
381
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x2_t>(tmp);
#endif
        }
        else if constexpr(N == 4)
        {
382
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
383
            return llvm_amdgcn_raw_buffer_load_i8x4(
384
385
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
386
            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(
387
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
388
389
390
391
392
393

            return as_type<int8x4_t>(tmp);
#endif
        }
        else if constexpr(N == 8)
        {
394
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
395
396
            vector_type<int8_t, 8> tmp;

397
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
398
399
400
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
401
402
403
404
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
405
406
407

            return tmp.AsType<int8x8_t>()(Number<0>{});
#else
408
            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(
409
410
411
412
413
414
415
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x8_t>(tmp);
#endif
        }
        else if constexpr(N == 16)
        {
416
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
417
418
            vector_type<int8_t, 16> tmp;

419
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
420
421
422
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
423
424
425
426
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
427
428

            tmp.AsType<int8x4_t>()(Number<2>{}) =
429
430
431
432
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 8 * sizeof(int8_t),
                                                 0);
433
434

            tmp.AsType<int8x4_t>()(Number<3>{}) =
435
436
437
438
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 12 * sizeof(int8_t),
                                                 0);
439
440
441

            return tmp.AsType<int8x16_t>()(Number<0>{});
#else
442
            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
443
444
445
446
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x16_t>(tmp);
#endif
447
448
        }
    }
Chao Liu's avatar
Chao Liu committed
449
450
451
}

template <typename T, index_t N>
452
453
454
455
__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
                                      int32x4_t dst_wave_buffer_resource,
                                      index_t dst_thread_addr_offset,
                                      index_t dst_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
456
{
457
    static_assert(
Chao Liu's avatar
Chao Liu committed
458
459
460
        (is_same<T, double>::value && (N == 1 || N == 2)) ||
            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
461
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
Chao Liu's avatar
Chao Liu committed
462
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
463
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
464

Chao Liu's avatar
Chao Liu committed
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
    if constexpr(is_same<T, double>::value)
    {
        // use fp32 store to mimic fp64 store
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_store_fp32x2(as_type<float2_t>(src_thread_data),
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
    }
    else if constexpr(is_same<T, float>::value)
Chao Liu's avatar
Chao Liu committed
486
487
488
    {
        if constexpr(N == 1)
        {
489
490
491
492
493
494
495
496
497
            llvm_amdgcn_raw_buffer_store_fp32(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
498
499
500
501
502
503
504
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
505
506
507
508
509
            llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
Chao Liu's avatar
Chao Liu committed
510
511
        }
    }
Chao Liu's avatar
Chao Liu committed
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 8)
        {
            vector_type<half_t, 8> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);

            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(half_t),
                                                0);
        }
    }
Chao Liu's avatar
Chao Liu committed
555
556
557
558
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
559
560
561
562
563
564
565
566
567
            llvm_amdgcn_raw_buffer_store_i32(src_thread_data,
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
568
569
570
571
572
573
574
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
        }
        else if constexpr(N == 4)
        {
575
576
577
578
579
            llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data,
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
Chao Liu's avatar
Chao Liu committed
580
581
582
583
584
585
        }
    }
    else if constexpr(is_same<T, int8_t>::value)
    {
        if constexpr(N == 1)
        {
586
587
588
589
590
            llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
                                            dst_wave_buffer_resource,
                                            dst_thread_addr_offset,
                                            dst_wave_addr_offset,
                                            0);
Chao Liu's avatar
Chao Liu committed
591
592
593
        }
        else if constexpr(N == 2)
        {
594
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
595
596
597
598
599
            llvm_amdgcn_raw_buffer_store_i8x2(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
600
#else
601
602
603
604
605
            llvm_amdgcn_raw_buffer_store_i16(as_type<int16_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
606
#endif
Chao Liu's avatar
Chao Liu committed
607
608
609
        }
        else if constexpr(N == 4)
        {
610
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
611
612
613
614
615
            llvm_amdgcn_raw_buffer_store_i8x4(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
616
#else
617
618
619
620
621
            llvm_amdgcn_raw_buffer_store_i32(as_type<int32_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
622
#endif
Chao Liu's avatar
Chao Liu committed
623
        }
624
625
        else if constexpr(N == 8)
        {
626
627
628
629
630
            llvm_amdgcn_raw_buffer_store_i32x2(as_type<int32x2_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
631
632
633
        }
        else if constexpr(N == 16)
        {
634
635
636
637
638
            llvm_amdgcn_raw_buffer_store_i32x4(as_type<int32x4_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
639
640
        }
    }
Chao Liu's avatar
Chao Liu committed
641
642
}

zjing14's avatar
zjing14 committed
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
template <typename T, index_t N>
__device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
                                           int32x4_t dst_wave_buffer_resource,
                                           index_t dst_thread_addr_offset,
                                           index_t dst_wave_addr_offset)
{
    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data,
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
        }
        else if constexpr(N == 2)
        {
            vector_type<float, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);
        }
        else if constexpr(N == 4)
        {
            vector_type<float, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<2>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 2 * sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<3>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 3 * sizeof(float),
                                                   0);
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data,
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);
        }
        else if constexpr(N == 2)
        {
            vector_type<int32_t, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);
        }
        else if constexpr(N == 4)
        {
            vector_type<int32_t, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<2>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 2 * sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<3>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 3 * sizeof(int32_t),
                                                  0);
        }
    }
}

Chao Liu's avatar
Chao Liu committed
765
// buffer_load requires:
Chao Liu's avatar
Chao Liu committed
766
//   1) p_src_wave must point to global memory space
767
//   2) p_src_wave must be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
768
769
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
770
__device__ typename vector_type_maker<T, N>::type::type
771
772
773
774
amd_buffer_load_invalid_element_return_return_zero(const T* p_src_wave,
                                                   index_t src_thread_element_offset,
                                                   bool src_thread_element_valid,
                                                   index_t src_element_space_size)
Chao Liu's avatar
Chao Liu committed
775
776
{
    const int32x4_t src_wave_buffer_resource =
777
        make_wave_buffer_resource(p_src_wave, src_element_space_size);
Chao Liu's avatar
Chao Liu committed
778

779
780
781
782
    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
Chao Liu's avatar
Chao Liu committed
783

784
785
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
786
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
787
    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
788

789
    return amd_buffer_load_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
790
791
        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
#else
792
    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
793
        src_wave_buffer_resource, src_thread_addr_offset, 0);
Chao Liu's avatar
Chao Liu committed
794

795
    return src_thread_element_valid ? tmp : vector_t(0);
Chao Liu's avatar
Chao Liu committed
796
797
798
#endif
}

799
// buffer_load requires:
Chao Liu's avatar
Chao Liu committed
800
//   1) p_src_wave must point to global memory space
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
//   2) p_src_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                        index_t src_thread_element_offset,
                                                        bool src_thread_element_valid,
                                                        index_t src_element_space_size,
                                                        T customized_value)
{
    const int32x4_t src_wave_buffer_resource =
        make_wave_buffer_resource(p_src_wave, src_element_space_size);

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
        src_wave_buffer_resource, src_thread_addr_offset, 0);

    return src_thread_element_valid ? tmp : vector_t(customized_value);
}

Chao Liu's avatar
Chao Liu committed
827
// buffer_store requires:
Chao Liu's avatar
Chao Liu committed
828
//   1) p_dst_wave must point to global memory
zjing14's avatar
zjing14 committed
829
//   2) p_dst_wave must be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
830
831
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
832
833
834
835
836
__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                 T* p_dst_wave,
                                 const index_t dst_thread_element_offset,
                                 const bool dst_thread_element_valid,
                                 const index_t dst_element_space_size)
Chao Liu's avatar
Chao Liu committed
837
838
{
    const int32x4_t dst_wave_buffer_resource =
839
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
Chao Liu's avatar
Chao Liu committed
840

841
    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
Chao Liu's avatar
Chao Liu committed
842

843
844
845
846
    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
847
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
848
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
849

850
    amd_buffer_store_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
851
852
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
853
    if(dst_thread_element_valid)
Chao Liu's avatar
Chao Liu committed
854
    {
855
        amd_buffer_store_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
856
857
858
859
860
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

zjing14's avatar
zjing14 committed
861
// buffer_atomic_add requires:
Chao Liu's avatar
Chao Liu committed
862
//   1) p_dst_wave must point to global memory
zjing14's avatar
zjing14 committed
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
//   2) p_dst_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff;

    amd_buffer_atomic_add_impl<scalar_t, vector_size>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_atomic_add_impl<scalar_t, vector_size>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

Chao Liu's avatar
Chao Liu committed
896
897
} // namespace ck
#endif