amd_buffer_addressing.hpp 47.6 KB
Newer Older
1
2
3
4
5
6
7
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP

#include "float_type.hpp"

namespace ck {

8
// For 128 bit SGPRs to supply resource constant in buffer instructions
9
10
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
11
union BufferResourceConstant
12
13
14
15
{
    int32x4_t data;
    T* address[2];
    int32_t range[4];
16
    int32_t config[4];
17
18
};

19
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
20
21
22
23
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.load.f32");
24

Chao Liu's avatar
Chao Liu committed
25
__device__ float2_t
26
__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
27
28
29
30
31
32
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");

__device__ float4_t
33
__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
34
35
36
37
38
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");

39
__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t srsrc,
40
41
42
                                                index_t vindex,
                                                index_t offset,
                                                bool glc,
Chao Liu's avatar
Chao Liu committed
43
44
                                                bool slc) __asm("llvm.amdgcn.buffer.load.f16");

45
__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
46
47
48
49
50
                                                   index_t vindex,
                                                   index_t offset,
                                                   bool glc,
                                                   bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");

51
__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
52
53
54
55
56
                                                   index_t vindex,
                                                   index_t offset,
                                                   bool glc,
                                                   bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");

57
__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
58
59
60
61
62
63
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.load.bf16");

__device__ ushort2_t
64
__llvm_amdgcn_buffer_load_bf16x2(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
65
66
67
68
69
70
                                 index_t vindex,
                                 index_t offset,
                                 bool glc,
                                 bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");

__device__ ushort4_t
71
__llvm_amdgcn_buffer_load_bf16x4(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
72
73
74
75
76
77
                                 index_t vindex,
                                 index_t offset,
                                 bool glc,
                                 bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");

__device__ void __llvm_amdgcn_buffer_store_f32(float vdata,
78
                                               int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
79
80
81
82
83
84
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.store.f32");

__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata,
85
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
86
87
88
89
90
91
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");

__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata,
92
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
93
94
95
96
97
98
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");

__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata,
99
                                               int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
100
101
102
103
104
105
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.store.f16");

__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata,
106
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
107
108
109
110
111
112
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");

__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata,
113
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
114
115
116
117
118
119
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");

__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,
120
                                                int32x4_t srsrc,
121
122
123
                                                index_t vindex,
                                                index_t offset,
                                                bool glc,
Chao Liu's avatar
Chao Liu committed
124
                                                bool slc) __asm("llvm.amdgcn.buffer.store.bf16");
125

126
__device__ void
Chao Liu's avatar
Chao Liu committed
127
__llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,
128
                                  int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
129
130
131
132
133
134
135
                                  index_t vindex,
                                  index_t offset,
                                  bool glc,
                                  bool slc) __asm("llvm.amdgcn.buffer.store.v2bf16");

__device__ void
__llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,
136
                                  int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
137
138
139
140
141
142
143
                                  index_t vindex,
                                  index_t offset,
                                  bool glc,
                                  bool slc) __asm("llvm.amdgcn.buffer.store.v4bf16");

__device__ void
__llvm_amdgcn_buffer_atomic_add_f32(float vdata,
144
                                    int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
145
146
147
                                    index_t vindex,
                                    index_t offset,
                                    bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");
148

Chao Liu's avatar
Chao Liu committed
149
// buffer_load requires:
150
151
//   1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
//   2) p_src_thread to be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
152
// It is user's responsibility to make sure that is true.
153
template <typename T, index_t VectorSize>
154
155
156
157
158
__device__ typename vector_type<T, VectorSize>::MemoryType
amd_buffer_load(const T* p_src_wave,
                index_t src_thread_data_offset,
                bool src_thread_data_valid,
                index_t src_elemenst_space);
159

Chao Liu's avatar
Chao Liu committed
160
// buffer_store requires:
161
162
//   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
//   2) p_dst_thread to be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
163
// It is user's responsibility to make sure that is true.
164
template <typename T, index_t VectorSize>
165
166
__device__ void amd_buffer_store(const T* p_src_thread,
                                 T* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
167
                                 index_t dst_thread_data_offset,
168
169
                                 bool dst_thread_data_valid,
                                 index_t dst_data_range);
170

171
172
173
174
// buffer_atomic requires:
//   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
//   2) p_dst_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
175
template <typename T, index_t VectorSize>
176
177
__device__ void amd_buffer_atomic_add(const T* p_src_thread,
                                      T* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
178
                                      index_t dst_thread_data_offset,
179
180
                                      bool dst_thread_data_valid,
                                      index_t dst_data_range);
181

182
template <>
183
__device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
184
                                           index_t src_thread_data_offset,
185
186
                                           bool src_thread_data_valid,
                                           index_t src_data_range)
187
{
188
    BufferResourceConstant<float> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
189

190
191
192
193
194
195
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
196
197

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
198
199
200
201
202
203
204
205
206
207
208

#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
                                         0,
                                         src_thread_data_valid ? src_thread_addr_offset
                                                               : 0xffffffff,
                                         false,
                                         false);
#else
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
209

Chao Liu's avatar
Chao Liu committed
210
    return __llvm_amdgcn_buffer_load_f32(
211
212
213
214
215
216
217
218
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
#else
    return src_thread_data_valid
               ? __llvm_amdgcn_buffer_load_f32(
                     src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
               : 0;
#endif
Chao Liu's avatar
Chao Liu committed
219
220
221
}

template <>
222
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
223
                                              index_t src_thread_data_offset,
224
225
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
226
{
227
    BufferResourceConstant<float> src_wave_buffer_resource;
228

229
230
231
232
233
234
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
235

Chao Liu's avatar
Chao Liu committed
236
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
237
238
239
240
241
242
243
244
245
246

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
                                           0,
                                           src_thread_data_valid ? src_thread_addr_offset
                                                                 : 0xffffffff,
                                           false,
                                           false);
#else
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
247

Chao Liu's avatar
Chao Liu committed
248
    return __llvm_amdgcn_buffer_load_f32x2(
249
250
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
251
252
253
}

template <>
254
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
255
                                              index_t src_thread_data_offset,
256
257
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
258
{
259
    BufferResourceConstant<float> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
260

261
262
263
264
265
266
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
267
268

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
269
270
271
272
273
274
275
276
277
278

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
                                           0,
                                           src_thread_data_valid ? src_thread_addr_offset
                                                                 : 0xffffffff,
                                           false,
                                           false);
#else
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
279

Chao Liu's avatar
Chao Liu committed
280
    return __llvm_amdgcn_buffer_load_f32x4(
281
282
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
Chao Liu's avatar
Chao Liu committed
283
284
285
}

template <>
286
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
287
                                             index_t src_thread_data_offset,
288
289
                                             bool src_thread_data_valid,
                                             index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
290
{
291
    BufferResourceConstant<half_t> src_wave_buffer_resource;
292

293
294
295
296
297
298
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
299

Chao Liu's avatar
Chao Liu committed
300
301
#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
302
303
304
305
306
307
308
309
310
311

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f16(src_wave_buffer_resource.data,
                                         0,
                                         src_thread_data_valid ? src_thread_addr_offset
                                                               : 0xffffffff,
                                         false,
                                         false);
#else
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
312
313

    return __llvm_amdgcn_buffer_load_f16(
314
315
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
316
#else
317
    return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
318
#endif
Chao Liu's avatar
Chao Liu committed
319
}
320

Chao Liu's avatar
Chao Liu committed
321
template <>
322
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
323
                                              index_t src_thread_data_offset,
324
325
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
326
{
327
    BufferResourceConstant<half_t> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
328

329
330
331
332
333
334
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
335
336
337

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

338
339
340
341
342
343
344
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float dst_out_tmp =
        __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
                                      0,
                                      src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
                                      false,
                                      false);
Chao Liu's avatar
Chao Liu committed
345
#else
346
347
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
348
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
349
350
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
Chao Liu's avatar
Chao Liu committed
351
352

    return *reinterpret_cast<half2_t*>(&dst_out_tmp);
353
354
355
}

template <>
356
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
357
                                              index_t src_thread_data_offset,
358
359
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
360
{
361
    BufferResourceConstant<half_t> src_wave_buffer_resource;
362

363
364
365
366
367
368
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
369
370

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
371

372
373
374
375
376
377
378
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float2_t dst_out_tmp =
        __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
                                        0,
                                        src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
                                        false,
                                        false);
Chao Liu's avatar
Chao Liu committed
379
#else
380
381
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
382
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
383
384
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
Chao Liu's avatar
Chao Liu committed
385
386
387
388
389

    return *reinterpret_cast<half4_t*>(&dst_out_tmp);
}

template <>
390
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
391
                                              index_t src_thread_data_offset,
392
393
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
394
{
395
    BufferResourceConstant<half_t> src_wave_buffer_resource;
396

397
398
399
400
401
402
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
403
404
405

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

406
407
408
409
410
411
412
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float4_t dst_out_tmp =
        __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
                                        0,
                                        src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
                                        false,
                                        false);
Chao Liu's avatar
Chao Liu committed
413
#else
414
415
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
416
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
417
418
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
Chao Liu's avatar
Chao Liu committed
419
420
421
422
423

    return *reinterpret_cast<half8_t*>(&dst_out_tmp);
}

template <>
424
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
425
                                             index_t src_thread_data_offset,
426
427
                                             bool src_thread_data_valid,
                                             index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
428
{
429
    BufferResourceConstant<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
430

431
432
433
434
435
436
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
437

Chao Liu's avatar
Chao Liu committed
438
439
#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
440
441
442
443
444
445
446
447
448
449

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_bf16(src_wave_buffer_resource.data,
                                          0,
                                          src_thread_data_valid ? src_thread_addr_offset
                                                                : 0xffffffff,
                                          false,
                                          false);
#else
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
450
451

    return __llvm_amdgcn_buffer_load_bf16(
452
453
454
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif

455
#else
456
    return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
457
#endif
Chao Liu's avatar
Chao Liu committed
458
459
460
}

template <>
461
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
462
                                                index_t src_thread_data_offset,
463
464
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
465
{
466
    BufferResourceConstant<ushort> src_wave_buffer_resource;
467

468
469
470
471
472
473
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
474
475
476

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

477
478
479
480
481
482
483
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float dst_out_tmp =
        __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
                                      0,
                                      src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
                                      false,
                                      false);
Chao Liu's avatar
Chao Liu committed
484
#else
485
486
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
487
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
488
489
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
Chao Liu's avatar
Chao Liu committed
490
491

    return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
492
493
494
}

template <>
495
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
496
                                                index_t src_thread_data_offset,
497
498
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
499
{
500
    BufferResourceConstant<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
501

502
503
504
505
506
507
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
508
509
510

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

511
512
513
514
515
516
517
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float2_t dst_out_tmp =
        __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
                                        0,
                                        src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
                                        false,
                                        false);
Chao Liu's avatar
Chao Liu committed
518
#else
519
520
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
521
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
522
523
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
Chao Liu's avatar
Chao Liu committed
524
525
526
527
528

    return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
}

template <>
529
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
530
                                                index_t src_thread_data_offset,
531
532
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
533
{
534
    BufferResourceConstant<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
535

536
537
538
539
540
541
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
542
543
544

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

545
546
547
548
549
550
551
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float4_t dst_out_tmp =
        __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
                                        0,
                                        src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
                                        false,
                                        false);
Chao Liu's avatar
Chao Liu committed
552
#else
553
554
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
555
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
556
557
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
Chao Liu's avatar
Chao Liu committed
558
559
560
561
562

    return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
}

template <>
563
564
__device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
565
                                           index_t dst_thread_data_offset,
566
567
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
568
{
569
    BufferResourceConstant<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
570

571
572
573
574
575
576
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
577

578
579
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

580
581
582
583
584
585
586
587
588
589
590
591
592
#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(*p_src_thread,
                                   dst_wave_buffer_resource.data,
                                   0,
                                   dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                   false,
                                   false);
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(*p_src_thread,
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
593
                                   0,
594
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
595
596
                                   false,
                                   false);
597
598
599
600
601
602
603
604
#endif
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(
            *p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
#endif
Chao Liu's avatar
Chao Liu committed
605
606
607
}

template <>
608
609
__device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
610
                                           index_t dst_thread_data_offset,
611
612
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
613
{
614
    BufferResourceConstant<float> dst_wave_buffer_resource;
615

616
617
618
619
620
621
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
622

Chao Liu's avatar
Chao Liu committed
623
624
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

625
626
627
628
629
630
631
632
633
634
635
636
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                     false,
                                     false);
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
637
                                     0,
638
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
639
640
                                     false,
                                     false);
641
#endif
642
643
644
}

template <>
645
646
__device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
647
                                           index_t dst_thread_data_offset,
648
649
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
650
{
651
    BufferResourceConstant<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
652

653
654
655
656
657
658
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
659

660
661
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

662
663
664
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
665
                                     0,
666
                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
Chao Liu's avatar
Chao Liu committed
667
668
                                     false,
                                     false);
669
670
671
672
673
674
675
676
677
678
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
#endif
Chao Liu's avatar
Chao Liu committed
679
680
681
}

template <>
682
683
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
684
                                            index_t dst_thread_data_offset,
685
686
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
687
{
688
    BufferResourceConstant<half_t> dst_wave_buffer_resource;
689

690
691
692
693
694
695
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
696

Chao Liu's avatar
Chao Liu committed
697
698
699
#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);

700
701
702
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f16(*p_src_thread,
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
703
                                   0,
704
                                   dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
Chao Liu's avatar
Chao Liu committed
705
706
                                   false,
                                   false);
707
#else
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f16(*p_src_thread,
                                   dst_wave_buffer_resource.data,
                                   0,
                                   dst_addr_shift + dst_thread_addr_offset,
                                   false,
                                   false);
#endif

#else
    if(dst_thread_data_valid)
    {
        p_dst_wave[dst_thread_data_offset] = *p_src_thread;
    }
723
724
725
726
#endif
}

template <>
727
728
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
729
                                            index_t dst_thread_data_offset,
730
731
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
732
{
733
    BufferResourceConstant<half_t> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
734

735
736
737
738
739
740
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
741
742

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
743

744
745
746
747
748
749
750
751
752
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
                                   dst_wave_buffer_resource.data,
                                   0,
                                   dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                   false,
                                   false);
Chao Liu's avatar
Chao Liu committed
753
#else
754
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
755
756

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
757
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
758
                                   0,
759
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
760
761
762
763
764
765
                                   false,
                                   false);
#endif
}

template <>
766
767
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
768
                                            index_t dst_thread_data_offset,
769
770
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
771
{
772
773
774
775
776
777
778
779
    BufferResourceConstant<half_t> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
780

781
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
782

783
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
784

785
786
787
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
788
                                     0,
789
                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
Chao Liu's avatar
Chao Liu committed
790
791
                                     false,
                                     false);
792
#else
793
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
794
795

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
796
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
797
                                     0,
798
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
799
800
                                     false,
                                     false);
801
802
803
#endif
}

804
template <>
805
806
__device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
807
                                            index_t dst_thread_data_offset,
808
809
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
810
{
811
812
813
814
815
816
817
818
819
820
    BufferResourceConstant<half_t> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
Chao Liu's avatar
Chao Liu committed
821

822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                     false,
                                     false);
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
858
859
860
861

#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

862
863
864
865
866
867
868
869
870
871
872
873
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_bf16(*p_src_thread,
                                    dst_wave_buffer_resource.data,
                                    0,
                                    dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                    false,
                                    false);
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_bf16(*p_src_thread,
                                    dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
874
                                    0,
875
                                    dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
876
877
                                    false,
                                    false);
878
879
#endif

Chao Liu's avatar
Chao Liu committed
880
#else
881
882
883
884
    if(dst_thread_data_valid)
    {
        p_dst_wave[dst_thread_data_offset] = *p_src_thread;
    }
Chao Liu's avatar
Chao Liu committed
885
886
887
888
#endif
}

template <>
889
890
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
891
                                            index_t dst_thread_data_offset,
892
893
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
894
{
895
    BufferResourceConstant<ushort> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
896

897
898
899
900
901
902
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
903

Chao Liu's avatar
Chao Liu committed
904
905
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

906
907
908
909
910
911
912
913
914
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
                                   dst_wave_buffer_resource.data,
                                   0,
                                   dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                   false,
                                   false);
Chao Liu's avatar
Chao Liu committed
915
#else
916
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
917
918

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
919
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
920
                                   0,
921
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
922
923
924
925
926
927
                                   false,
                                   false);
#endif
}

template <>
928
929
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
930
                                            index_t dst_thread_data_offset,
931
932
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
933
{
934
    BufferResourceConstant<ushort> dst_wave_buffer_resource;
935

936
937
938
939
940
941
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
942

Chao Liu's avatar
Chao Liu committed
943
944
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

945
946
947
948
949
950
951
952
953
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                     false,
                                     false);
954
#else
955
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
956
957

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
958
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
959
                                     0,
960
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
961
962
                                     false,
                                     false);
963
964
965
#endif
}

Chao Liu's avatar
Chao Liu committed
966
template <>
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                     false,
                                     false);
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
1008
                                                index_t dst_thread_data_offset,
1009
1010
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
1011
{
1012
    BufferResourceConstant<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
1013

1014
1015
1016
1017
1018
1019
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;
Chao Liu's avatar
Chao Liu committed
1020
1021
1022

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
                                        dst_wave_buffer_resource.data,
                                        0,
                                        dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
                                        false);
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
                                        dst_wave_buffer_resource.data,
                                        0,
                                        dst_addr_shift + dst_thread_addr_offset,
                                        false);
#endif
Chao Liu's avatar
Chao Liu committed
1038
1039
1040
}

template <>
1041
1042
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
1043
                                                index_t dst_thread_data_offset,
1044
1045
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
1046
{
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
    BufferResourceConstant<float> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range;
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    for(index_t i = 0; i < 2; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(
            p_src_thread[i],
            dst_wave_buffer_resource.data,
            0,
            dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
            false);
    }
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
1071
1072
    for(index_t i = 0; i < 2; ++i)
    {
1073
1074
1075
1076
1077
1078
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_buffer_resource.data,
                                            0,
                                            dst_addr_shift + dst_thread_addr_offset +
                                                i * sizeof(float),
                                            false);
Chao Liu's avatar
Chao Liu committed
1079
    }
1080
#endif
Chao Liu's avatar
Chao Liu committed
1081
1082
1083
}

template <>
1084
1085
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
1086
                                                index_t dst_thread_data_offset,
1087
1088
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
1089
{
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
    BufferResourceConstant<float> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
Chao Liu's avatar
Chao Liu committed
1102
1103
    for(index_t i = 0; i < 4; ++i)
    {
1104
1105
1106
1107
1108
1109
        __llvm_amdgcn_buffer_atomic_add_f32(
            p_src_thread[i],
            dst_wave_buffer_resource.data,
            0,
            dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
            false);
Chao Liu's avatar
Chao Liu committed
1110
    }
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
#else
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    for(index_t i = 0; i < 4; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_buffer_resource.data,
                                            0,
                                            dst_addr_shift + dst_thread_addr_offset +
                                                i * sizeof(float),
                                            false);
    }
#endif
Chao Liu's avatar
Chao Liu committed
1124
1125
}

1126
1127
} // namespace ck
#endif