amd_buffer_addressing.hpp 28.9 KB
Newer Older
Chao Liu's avatar
tidy  
Chao Liu committed
1
2
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP
Chao Liu's avatar
Chao Liu committed
3

4
#include "data_type.hpp"
Chao Liu's avatar
Chao Liu committed
5
6
7
8

namespace ck {

template <typename T>
Chao Liu's avatar
tidy  
Chao Liu committed
9
union BufferResource
Chao Liu's avatar
Chao Liu committed
10
11
12
{
    // 128 bit SGPRs to supply buffer resource in buffer instructions
    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
13
    int32x4_t content;
14
15
16
    StaticallyIndexedArray<T*, 2> address;
    StaticallyIndexedArray<int32_t, 4> range;
    StaticallyIndexedArray<int32_t, 4> config;
Chao Liu's avatar
Chao Liu committed
17
18
19
};

template <typename T>
20
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
Chao Liu's avatar
Chao Liu committed
21
{
Chao Liu's avatar
tidy  
Chao Liu committed
22
    BufferResource<T> wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
23
24

    // wavewise base address (64 bit)
25
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
Chao Liu's avatar
Chao Liu committed
26
    // wavewise range (32 bit)
27
    wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
Chao Liu's avatar
Chao Liu committed
28
    // wavewise setting (32 bit)
29
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
30

31
    return wave_buffer_resource.content;
Chao Liu's avatar
Chao Liu committed
32
33
34
35
}

// load
__device__ int8_t
36
37
38
39
llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
                               index_t voffset,
                               index_t soffset,
                               index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
40
41

__device__ int8x2_t
42
43
44
45
llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
46
47

__device__ int8x4_t
48
49
50
51
llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
52

Chao Liu's avatar
Chao Liu committed
53
__device__ int16_t
54
55
56
57
llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
58
__device__ int32_t
59
60
61
62
llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
63
64

__device__ int32x2_t
65
66
67
68
llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
Chao Liu's avatar
Chao Liu committed
69
70

__device__ int32x4_t
71
72
73
74
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
75
76
// half
__device__ half_t
77
78
79
80
llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
81
82

__device__ half2_t
83
84
85
86
llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
87
88

__device__ half4_t
89
90
91
92
llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
Chao Liu's avatar
Chao Liu committed
93

94
// float
Chao Liu's avatar
Chao Liu committed
95
__device__ float
96
97
98
99
llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
Chao Liu's avatar
Chao Liu committed
100
101

__device__ float2_t
102
103
104
105
llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
Chao Liu's avatar
Chao Liu committed
106
107

__device__ float4_t
108
109
110
111
llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
Chao Liu's avatar
Chao Liu committed
112
113
114

// store
__device__ void
115
116
117
118
119
120
121
122
llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
                                int32x4_t rsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");

__device__ void
llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
Chao Liu's avatar
Chao Liu committed
123
124
125
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
126
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
Chao Liu's avatar
Chao Liu committed
127

128
__device__ void
129
130
131
132
133
llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
134
135

__device__ void
136
137
138
139
140
141
142
143
144
145
146
147
llvm_amdgcn_raw_buffer_store_i16(int16_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");

__device__ void
llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
148

Chao Liu's avatar
Chao Liu committed
149
__device__ void
150
llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata,
Chao Liu's avatar
Chao Liu committed
151
152
153
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
154
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
Chao Liu's avatar
Chao Liu committed
155
156

__device__ void
157
llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
Chao Liu's avatar
Chao Liu committed
158
159
160
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
161
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
Chao Liu's avatar
Chao Liu committed
162

163
// half
Chao Liu's avatar
Chao Liu committed
164
__device__ void
165
166
167
168
169
llvm_amdgcn_raw_buffer_store_fp16(half_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
Chao Liu's avatar
Chao Liu committed
170
171

__device__ void
172
173
174
175
176
llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
Chao Liu's avatar
Chao Liu committed
177

178
__device__ void
179
llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
180
181
182
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
183
184
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
// float
185
__device__ void
186
187
188
189
190
llvm_amdgcn_raw_buffer_store_fp32(float vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
191
192

__device__ void
193
llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata,
Chao Liu's avatar
Chao Liu committed
194
195
196
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
197
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
Chao Liu's avatar
Chao Liu committed
198
199

__device__ void
200
201
202
203
204
llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
Chao Liu's avatar
Chao Liu committed
205
206

template <typename T, index_t N>
207
208
209
__device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                                 index_t src_thread_addr_offset,
                                                                 index_t src_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
210
{
211
212
213
214
215
216
    static_assert(
        (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
217
218
219
220
221

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
222
            return llvm_amdgcn_raw_buffer_load_fp32(
Chao Liu's avatar
Chao Liu committed
223
224
225
226
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
227
            return llvm_amdgcn_raw_buffer_load_fp32x2(
Chao Liu's avatar
Chao Liu committed
228
229
230
231
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
232
            return llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
233
234
235
236
237
238
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<float, 8> tmp;

239
            tmp.AsType<float4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
240
241
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

242
            tmp.AsType<float4_t>()(Number<1>{}) =
243
244
245
246
                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset + 4 * sizeof(float),
                                                   0);
247

248
            return tmp.AsType<float8_t>()(Number<0>{});
249
250
251
252
253
254
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
255
            return llvm_amdgcn_raw_buffer_load_fp16(
256
257
258
259
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
260
            return llvm_amdgcn_raw_buffer_load_fp16x2(
261
262
263
264
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
265
            return llvm_amdgcn_raw_buffer_load_fp16x4(
266
267
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
268
        else if constexpr(N == 8)
269
        {
zjing14's avatar
zjing14 committed
270
#if 0
271
272
            vector_type<half_t, 8> tmp;

273
            tmp.AsType<half4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp16x4(
274
275
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

276
            tmp.AsType<half4_t>()(Number<1>{}) =
277
                llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
278
279
280
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset + 4 * sizeof(half_t),
                                                     0);
Chao Liu's avatar
Chao Liu committed
281

282
            return tmp.AsType<half8_t>()(Number<0>{});
zjing14's avatar
zjing14 committed
283
#else
284
            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
zjing14's avatar
zjing14 committed
285
286
287
288
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<half8_t>(tmp);
#endif
Chao Liu's avatar
Chao Liu committed
289
290
291
292
293
294
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
295
            return llvm_amdgcn_raw_buffer_load_i32(
Chao Liu's avatar
Chao Liu committed
296
297
298
299
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
300
            return llvm_amdgcn_raw_buffer_load_i32x2(
Chao Liu's avatar
Chao Liu committed
301
302
303
304
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
305
            return llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
306
307
308
309
310
311
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<int32_t, 8> tmp;

312
            tmp.AsType<int32x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
313
314
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

315
            tmp.AsType<int32x4_t>()(Number<1>{}) =
316
317
318
319
                llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  0);
320
            return tmp.AsType<int32x8_t>()(Number<0>{});
Chao Liu's avatar
Chao Liu committed
321
322
        }
    }
323
    else if constexpr(is_same<T, int8_t>::value)
324
325
326
    {
        if constexpr(N == 1)
        {
327
            return llvm_amdgcn_raw_buffer_load_i8(
328
329
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
330
        else if constexpr(N == 2)
331
        {
332
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
333
            return llvm_amdgcn_raw_buffer_load_i8x2(
334
335
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
336
            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(
337
338
339
340
341
342
343
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x2_t>(tmp);
#endif
        }
        else if constexpr(N == 4)
        {
344
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
345
            return llvm_amdgcn_raw_buffer_load_i8x4(
346
347
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
348
            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(
349
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
350
351
352
353
354
355

            return as_type<int8x4_t>(tmp);
#endif
        }
        else if constexpr(N == 8)
        {
356
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
357
358
            vector_type<int8_t, 8> tmp;

359
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
360
361
362
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
363
364
365
366
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
367
368
369

            return tmp.AsType<int8x8_t>()(Number<0>{});
#else
370
            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(
371
372
373
374
375
376
377
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x8_t>(tmp);
#endif
        }
        else if constexpr(N == 16)
        {
378
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
379
380
            vector_type<int8_t, 16> tmp;

381
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
382
383
384
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
385
386
387
388
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
389
390

            tmp.AsType<int8x4_t>()(Number<2>{}) =
391
392
393
394
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 8 * sizeof(int8_t),
                                                 0);
395
396

            tmp.AsType<int8x4_t>()(Number<3>{}) =
397
398
399
400
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 12 * sizeof(int8_t),
                                                 0);
401
402
403

            return tmp.AsType<int8x16_t>()(Number<0>{});
#else
404
            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
405
406
407
408
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x16_t>(tmp);
#endif
409
410
        }
    }
Chao Liu's avatar
Chao Liu committed
411
412
413
}

template <typename T, index_t N>
414
415
416
417
__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
                                      int32x4_t dst_wave_buffer_resource,
                                      index_t dst_thread_addr_offset,
                                      index_t dst_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
418
{
419
420
421
422
423
424
    static_assert(
        (is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
425
426
427
428
429

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
430
431
432
433
434
435
436
437
438
            llvm_amdgcn_raw_buffer_store_fp32(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
439
440
441
442
443
444
445
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
446
447
448
449
450
            llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
Chao Liu's avatar
Chao Liu committed
451
452
453
454
455
456
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
457
458
459
460
461
462
463
464
465
            llvm_amdgcn_raw_buffer_store_i32(src_thread_data,
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
466
467
468
469
470
471
472
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
        }
        else if constexpr(N == 4)
        {
473
474
475
476
477
            llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data,
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
Chao Liu's avatar
Chao Liu committed
478
479
480
481
482
483
        }
    }
    else if constexpr(is_same<T, int8_t>::value)
    {
        if constexpr(N == 1)
        {
484
485
486
487
488
            llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
                                            dst_wave_buffer_resource,
                                            dst_thread_addr_offset,
                                            dst_wave_addr_offset,
                                            0);
Chao Liu's avatar
Chao Liu committed
489
490
491
        }
        else if constexpr(N == 2)
        {
492
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
493
494
495
496
497
            llvm_amdgcn_raw_buffer_store_i8x2(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
498
#else
499
500
501
502
503
            llvm_amdgcn_raw_buffer_store_i16(as_type<int16_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
504
#endif
Chao Liu's avatar
Chao Liu committed
505
506
507
        }
        else if constexpr(N == 4)
        {
508
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
509
510
511
512
513
            llvm_amdgcn_raw_buffer_store_i8x4(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
514
#else
515
516
517
518
519
            llvm_amdgcn_raw_buffer_store_i32(as_type<int32_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
520
#endif
Chao Liu's avatar
Chao Liu committed
521
        }
522
523
        else if constexpr(N == 8)
        {
524
525
526
527
528
            llvm_amdgcn_raw_buffer_store_i32x2(as_type<int32x2_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
529
530
531
        }
        else if constexpr(N == 16)
        {
532
533
534
535
536
            llvm_amdgcn_raw_buffer_store_i32x4(as_type<int32x4_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
537
538
539
540
541
542
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
543
544
545
546
547
548
549
550
551
            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
552
553
554
555
556
557
558
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
559
560
561
562
563
            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
564
565
566
567
568
        }
        else if constexpr(N == 8)
        {
            vector_type<half_t, 8> tmp{src_thread_data};

569
570
571
572
573
            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
574

575
576
577
578
579
            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(half_t),
                                                0);
580
        }
Chao Liu's avatar
Chao Liu committed
581
582
583
584
585
    }
}

// buffer_load requires:
//   1) p_src_wave must be in global memory space
586
//   2) p_src_wave must be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
587
588
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
589
__device__ typename vector_type_maker<T, N>::type::type
590
591
592
593
amd_buffer_load_invalid_element_return_return_zero(const T* p_src_wave,
                                                   index_t src_thread_element_offset,
                                                   bool src_thread_element_valid,
                                                   index_t src_element_space_size)
Chao Liu's avatar
Chao Liu committed
594
595
{
    const int32x4_t src_wave_buffer_resource =
596
        make_wave_buffer_resource(p_src_wave, src_element_space_size);
Chao Liu's avatar
Chao Liu committed
597

598
599
600
601
    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
Chao Liu's avatar
Chao Liu committed
602

603
604
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
605
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
606
    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
607

608
    return amd_buffer_load_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
609
610
        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
#else
611
    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
612
        src_wave_buffer_resource, src_thread_addr_offset, 0);
Chao Liu's avatar
Chao Liu committed
613

614
    return src_thread_element_valid ? tmp : vector_t(0);
Chao Liu's avatar
Chao Liu committed
615
616
617
#endif
}

618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
// buffer_load requires:
//   1) p_src_wave must be in global memory space
//   2) p_src_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                        index_t src_thread_element_offset,
                                                        bool src_thread_element_valid,
                                                        index_t src_element_space_size,
                                                        T customized_value)
{
    const int32x4_t src_wave_buffer_resource =
        make_wave_buffer_resource(p_src_wave, src_element_space_size);

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
        src_wave_buffer_resource, src_thread_addr_offset, 0);

    return src_thread_element_valid ? tmp : vector_t(customized_value);
}

Chao Liu's avatar
Chao Liu committed
646
647
648
649
650
// buffer_store requires:
//   1) p_dst_wave must be global memory
//   2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
651
652
653
654
655
__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                 T* p_dst_wave,
                                 const index_t dst_thread_element_offset,
                                 const bool dst_thread_element_valid,
                                 const index_t dst_element_space_size)
Chao Liu's avatar
Chao Liu committed
656
657
{
    const int32x4_t dst_wave_buffer_resource =
658
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
Chao Liu's avatar
Chao Liu committed
659

660
    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
Chao Liu's avatar
Chao Liu committed
661

662
663
664
665
    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
666
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
667
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
668

669
    amd_buffer_store_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
670
671
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
672
    if(dst_thread_element_valid)
Chao Liu's avatar
Chao Liu committed
673
    {
674
        amd_buffer_store_impl<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
675
676
677
678
679
680
681
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

} // namespace ck
#endif