amd_buffer_addressing.hpp 27.4 KB
Newer Older
Chao Liu's avatar
tidy  
Chao Liu committed
1
2
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP
Chao Liu's avatar
Chao Liu committed
3

4
#include "data_type.hpp"
Chao Liu's avatar
Chao Liu committed
5
6
7
8

namespace ck {

template <typename T>
Chao Liu's avatar
tidy  
Chao Liu committed
9
union BufferResource
Chao Liu's avatar
Chao Liu committed
10
11
12
13
{
    // 128 bit SGPRs to supply buffer resource in buffer instructions
    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
    int32x4_t data;
14
15
16
    StaticallyIndexedArray<T*, 2> address;
    StaticallyIndexedArray<int32_t, 4> range;
    StaticallyIndexedArray<int32_t, 4> config;
Chao Liu's avatar
Chao Liu committed
17
18
19
20
21
};

template <typename T>
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size)
{
Chao Liu's avatar
tidy  
Chao Liu committed
22
    BufferResource<T> wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
23
24

    // wavewise base address (64 bit)
25
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
Chao Liu's avatar
Chao Liu committed
26
    // wavewise range (32 bit)
27
    wave_buffer_resource.range(Number<2>{}) = data_space_size * sizeof(T);
Chao Liu's avatar
Chao Liu committed
28
    // wavewise setting (32 bit)
29
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
30
31
32
33
34
35

    return wave_buffer_resource.data;
}

// load
__device__ int8_t
36
37
38
39
llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
                               index_t voffset,
                               index_t soffset,
                               index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
40
41

__device__ int8x2_t
42
43
44
45
llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
46
47

__device__ int8x4_t
48
49
50
51
llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
52

Chao Liu's avatar
Chao Liu committed
53
__device__ int16_t
54
55
56
57
llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
58
__device__ int32_t
59
60
61
62
llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
Chao Liu's avatar
Chao Liu committed
63
64

__device__ int32x2_t
65
66
67
68
llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
Chao Liu's avatar
Chao Liu committed
69
70

__device__ int32x4_t
71
72
73
74
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
75
76
// half
__device__ half_t
77
78
79
80
llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
81
82

__device__ half2_t
83
84
85
86
llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
87
88

__device__ half4_t
89
90
91
92
llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
Chao Liu's avatar
Chao Liu committed
93

94
// float
Chao Liu's avatar
Chao Liu committed
95
__device__ float
96
97
98
99
llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
Chao Liu's avatar
Chao Liu committed
100
101

__device__ float2_t
102
103
104
105
llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
Chao Liu's avatar
Chao Liu committed
106
107

__device__ float4_t
108
109
110
111
llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
Chao Liu's avatar
Chao Liu committed
112
113
114

// store
__device__ void
115
116
117
118
119
120
121
122
llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
                                int32x4_t rsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");

__device__ void
llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
Chao Liu's avatar
Chao Liu committed
123
124
125
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
126
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
Chao Liu's avatar
Chao Liu committed
127

128
__device__ void
129
130
131
132
133
llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
134
135

__device__ void
136
137
138
139
140
141
142
143
144
145
146
147
llvm_amdgcn_raw_buffer_store_i16(int16_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");

__device__ void
llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
148

Chao Liu's avatar
Chao Liu committed
149
__device__ void
150
llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata,
Chao Liu's avatar
Chao Liu committed
151
152
153
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
154
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
Chao Liu's avatar
Chao Liu committed
155
156

__device__ void
157
llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
Chao Liu's avatar
Chao Liu committed
158
159
160
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
161
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
Chao Liu's avatar
Chao Liu committed
162

163
// half
Chao Liu's avatar
Chao Liu committed
164
__device__ void
165
166
167
168
169
llvm_amdgcn_raw_buffer_store_fp16(half_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
Chao Liu's avatar
Chao Liu committed
170
171

__device__ void
172
173
174
175
176
llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
Chao Liu's avatar
Chao Liu committed
177

178
__device__ void
179
llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
180
181
182
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
183
184
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
// float
185
__device__ void
186
187
188
189
190
llvm_amdgcn_raw_buffer_store_fp32(float vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
191
192

__device__ void
193
llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata,
Chao Liu's avatar
Chao Liu committed
194
195
196
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
197
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
Chao Liu's avatar
Chao Liu committed
198
199

__device__ void
200
201
202
203
204
llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
Chao Liu's avatar
Chao Liu committed
205
206
207
208
209
210
211

template <typename T, index_t N>
__device__ typename vector_type<T, N>::type
amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
                        index_t src_thread_addr_offset,
                        index_t src_wave_addr_offset)
{
212
213
214
215
216
217
    static_assert(
        (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
218
219
220
221
222

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
223
            return llvm_amdgcn_raw_buffer_load_fp32(
Chao Liu's avatar
Chao Liu committed
224
225
226
227
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
228
            return llvm_amdgcn_raw_buffer_load_fp32x2(
Chao Liu's avatar
Chao Liu committed
229
230
231
232
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
233
            return llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
234
235
236
237
238
239
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<float, 8> tmp;

240
            tmp.AsType<float4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4(
Chao Liu's avatar
Chao Liu committed
241
242
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

243
            tmp.AsType<float4_t>()(Number<1>{}) =
244
245
246
247
                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset + 4 * sizeof(float),
                                                   0);
248

249
            return tmp.AsType<float8_t>()(Number<0>{});
250
251
252
253
254
255
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
256
            return llvm_amdgcn_raw_buffer_load_fp16(
257
258
259
260
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
261
            return llvm_amdgcn_raw_buffer_load_fp16x2(
262
263
264
265
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
266
            return llvm_amdgcn_raw_buffer_load_fp16x4(
267
268
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
269
        else if constexpr(N == 8)
270
        {
zjing14's avatar
zjing14 committed
271
#if 0
272
273
            vector_type<half_t, 8> tmp;

274
            tmp.AsType<half4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp16x4(
275
276
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

277
            tmp.AsType<half4_t>()(Number<1>{}) =
278
                llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
279
280
281
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset + 4 * sizeof(half_t),
                                                     0);
Chao Liu's avatar
Chao Liu committed
282

283
            return tmp.AsType<half8_t>()(Number<0>{});
zjing14's avatar
zjing14 committed
284
#else
285
            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
zjing14's avatar
zjing14 committed
286
287
288
289
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<half8_t>(tmp);
#endif
Chao Liu's avatar
Chao Liu committed
290
291
292
293
294
295
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
296
            return llvm_amdgcn_raw_buffer_load_i32(
Chao Liu's avatar
Chao Liu committed
297
298
299
300
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 2)
        {
301
            return llvm_amdgcn_raw_buffer_load_i32x2(
Chao Liu's avatar
Chao Liu committed
302
303
304
305
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 4)
        {
306
            return llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
307
308
309
310
311
312
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
        else if constexpr(N == 8)
        {
            vector_type<int32_t, 8> tmp;

313
            tmp.AsType<int32x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4(
Chao Liu's avatar
Chao Liu committed
314
315
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

316
            tmp.AsType<int32x4_t>()(Number<1>{}) =
317
318
319
320
                llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  0);
321
            return tmp.AsType<int32x8_t>()(Number<0>{});
Chao Liu's avatar
Chao Liu committed
322
323
        }
    }
324
    else if constexpr(is_same<T, int8_t>::value)
325
326
327
    {
        if constexpr(N == 1)
        {
328
            return llvm_amdgcn_raw_buffer_load_i8(
329
330
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
        }
331
        else if constexpr(N == 2)
332
        {
333
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
334
            return llvm_amdgcn_raw_buffer_load_i8x2(
335
336
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
337
            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(
338
339
340
341
342
343
344
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x2_t>(tmp);
#endif
        }
        else if constexpr(N == 4)
        {
345
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
346
            return llvm_amdgcn_raw_buffer_load_i8x4(
347
348
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
#else
349
            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(
350
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
351
352
353
354
355
356

            return as_type<int8x4_t>(tmp);
#endif
        }
        else if constexpr(N == 8)
        {
357
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
358
359
            vector_type<int8_t, 8> tmp;

360
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
361
362
363
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
364
365
366
367
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
368
369
370

            return tmp.AsType<int8x8_t>()(Number<0>{});
#else
371
            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(
372
373
374
375
376
377
378
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x8_t>(tmp);
#endif
        }
        else if constexpr(N == 16)
        {
379
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
380
381
            vector_type<int8_t, 16> tmp;

382
            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
383
384
385
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            tmp.AsType<int8x4_t>()(Number<1>{}) =
386
387
388
389
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
                                                 0);
390
391

            tmp.AsType<int8x4_t>()(Number<2>{}) =
392
393
394
395
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 8 * sizeof(int8_t),
                                                 0);
396
397

            tmp.AsType<int8x4_t>()(Number<3>{}) =
398
399
400
401
                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset + 12 * sizeof(int8_t),
                                                 0);
402
403
404

            return tmp.AsType<int8x16_t>()(Number<0>{});
#else
405
            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
406
407
408
409
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);

            return as_type<int8x16_t>(tmp);
#endif
410
411
        }
    }
Chao Liu's avatar
Chao Liu committed
412
413
414
415
416
417
418
419
}

template <typename T, index_t N>
__device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type src_thread_data,
                                         int32x4_t dst_wave_buffer_resource,
                                         index_t dst_thread_addr_offset,
                                         index_t dst_wave_addr_offset)
{
420
421
422
423
424
425
    static_assert(
        (is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
426
427
428
429
430

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
431
432
433
434
435
436
437
438
439
            llvm_amdgcn_raw_buffer_store_fp32(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
440
441
442
443
444
445
446
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
447
448
449
450
451
            llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
Chao Liu's avatar
Chao Liu committed
452
453
454
455
456
457
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
458
459
460
461
462
463
464
465
466
            llvm_amdgcn_raw_buffer_store_i32(src_thread_data,
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data,
Chao Liu's avatar
Chao Liu committed
467
468
469
470
471
472
473
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
        }
        else if constexpr(N == 4)
        {
474
475
476
477
478
            llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data,
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
Chao Liu's avatar
Chao Liu committed
479
480
481
482
483
484
        }
    }
    else if constexpr(is_same<T, int8_t>::value)
    {
        if constexpr(N == 1)
        {
485
486
487
488
489
            llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
                                            dst_wave_buffer_resource,
                                            dst_thread_addr_offset,
                                            dst_wave_addr_offset,
                                            0);
Chao Liu's avatar
Chao Liu committed
490
491
492
        }
        else if constexpr(N == 2)
        {
493
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
494
495
496
497
498
            llvm_amdgcn_raw_buffer_store_i8x2(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
499
#else
500
501
502
503
504
            llvm_amdgcn_raw_buffer_store_i16(as_type<int16_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
505
#endif
Chao Liu's avatar
Chao Liu committed
506
507
508
        }
        else if constexpr(N == 4)
        {
509
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
510
511
512
513
514
            llvm_amdgcn_raw_buffer_store_i8x4(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
515
#else
516
517
518
519
520
            llvm_amdgcn_raw_buffer_store_i32(as_type<int32_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             0);
521
#endif
Chao Liu's avatar
Chao Liu committed
522
        }
523
524
        else if constexpr(N == 8)
        {
525
526
527
528
529
            llvm_amdgcn_raw_buffer_store_i32x2(as_type<int32x2_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
530
531
532
        }
        else if constexpr(N == 16)
        {
533
534
535
536
537
            llvm_amdgcn_raw_buffer_store_i32x4(as_type<int32x4_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
538
539
540
541
542
543
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 1)
        {
544
545
546
547
548
549
550
551
552
            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              0);
        }
        else if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
553
554
555
556
557
558
559
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
        }
        else if constexpr(N == 4)
        {
560
561
562
563
564
            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
565
566
567
568
569
        }
        else if constexpr(N == 8)
        {
            vector_type<half_t, 8> tmp{src_thread_data};

570
571
572
573
574
            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
                                                0);
575

576
577
578
579
580
            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(half_t),
                                                0);
581
        }
Chao Liu's avatar
Chao Liu committed
582
583
584
585
586
587
588
589
    }
}

// buffer_load requires:
//   1) p_src_wave must be in global memory space
//   2) p_src_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
590
591
592
593
594
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_v2(const T* p_src_wave,
                   index_t src_thread_data_offset,
                   bool src_thread_data_valid,
                   index_t src_element_space)
Chao Liu's avatar
Chao Liu committed
595
596
597
598
599
600
{
    const int32x4_t src_wave_buffer_resource =
        make_wave_buffer_resource(p_src_wave, src_element_space);

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(T);

601
602
603
604
    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
605
606
607
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

608
    return amd_buffer_load_impl_v2<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
609
610
        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
#else
611
612
    vector_t tmp = amd_buffer_load_impl_v2<scalar_t, vector_size>(
        src_wave_buffer_resource, src_thread_addr_offset, 0);
Chao Liu's avatar
Chao Liu committed
613
614
615
616
617
618
619
620
621
622

    return src_thread_data_valid ? tmp : vector_t(0);
#endif
}

// buffer_store requires:
//   1) p_dst_wave must be global memory
//   2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
623
624
625
626
627
628
__device__ void
amd_buffer_store_v2(const typename vector_type_maker<T, N>::type::type src_thread_data,
                    T* p_dst_wave,
                    const index_t dst_thread_data_offset,
                    const bool dst_thread_data_valid,
                    const index_t dst_element_space)
Chao Liu's avatar
Chao Liu committed
629
630
631
632
633
634
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space);

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(T);

635
636
637
638
    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
639
640
641
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

642
    amd_buffer_store_impl_v2<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
643
644
645
646
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_data_valid)
    {
647
        amd_buffer_store_impl_v2<scalar_t, vector_size>(
Chao Liu's avatar
Chao Liu committed
648
649
650
651
652
653
654
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

} // namespace ck
#endif