amd_buffer_addressing.hpp 42.3 KB
Newer Older
1
2
3
4
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP

#include "float_type.hpp"
Chao Liu's avatar
Chao Liu committed
5
#include "amd_buffer_addressing_v2.hpp"
6
7
8

namespace ck {

9
10
11
12
13
14
15
16
17
18
19
template <typename T>
union BufferResource
{
    // 128 bit SGPRs to supply buffer resource in buffer instructions
    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
    int32x4_t data;
    T* address[2];
    int32_t range[4];
    int32_t config[4];
};

20
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
21
22
23
24
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.load.f32");
25

Chao Liu's avatar
Chao Liu committed
26
__device__ float2_t
27
__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");

__device__ float4_t
34
__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
35
36
37
38
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
Chao Liu's avatar
Chao Liu committed
39
40
41
42
43
44
45
46
47
48
49
__device__ half_t
__llvm_amdgcn_raw_buffer_load_f16(int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");

__device__ ushort
__llvm_amdgcn_raw_buffer_load_bf16(int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.bf16");
Chao Liu's avatar
Chao Liu committed
50
51

__device__ void __llvm_amdgcn_buffer_store_f32(float vdata,
52
                                               int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
53
54
55
56
57
58
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.store.f32");

__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata,
59
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
60
61
62
63
64
65
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");

__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata,
66
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
67
68
69
70
71
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");

72
__device__ void
Chao Liu's avatar
Chao Liu committed
73
74
75
76
77
__llvm_amdgcn_raw_buffer_store_f16(half_t vdata,
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
Chao Liu's avatar
Chao Liu committed
78
79

__device__ void
Chao Liu's avatar
Chao Liu committed
80
81
82
83
84
85
86
87
88
89
90
__llvm_amdgcn_raw_buffer_store_bf16(ushort vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.bf16");

#if CK_USE_AMD_BUFFER_ATOMIC_FADD
#if CK_HIP_VERSION_FLAT >= 3010020405
// starting ROCm-3.10, the return type becomes float
__device__ float
#else
Chao Liu's avatar
Chao Liu committed
91
__device__ void
Chao Liu's avatar
Chao Liu committed
92
#endif
Chao Liu's avatar
Chao Liu committed
93
__llvm_amdgcn_buffer_atomic_add_f32(float vdata,
Chao Liu's avatar
Chao Liu committed
94
                                    int32x4_t rsrc,
Chao Liu's avatar
Chao Liu committed
95
96
97
                                    index_t vindex,
                                    index_t offset,
                                    bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");
Chao Liu's avatar
Chao Liu committed
98
#endif
99

Chao Liu's avatar
Chao Liu committed
100
// buffer_load requires:
Chao Liu's avatar
Chao Liu committed
101
102
//   1) p_src_wave must be in global memory space
//   2) p_src_wave to be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
103
// It is user's responsibility to make sure that is true.
104
template <typename T, index_t VectorSize>
Chao Liu's avatar
Chao Liu committed
105
106
107
108
__device__ typename vector_type<T, VectorSize>::type amd_buffer_load(const T* p_src_wave,
                                                                     index_t src_thread_data_offset,
                                                                     bool src_thread_data_valid,
                                                                     index_t src_elemenst_space);
109

Chao Liu's avatar
Chao Liu committed
110
// buffer_store requires:
111
112
//   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
//   2) p_dst_thread to be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
113
// It is user's responsibility to make sure that is true.
114
template <typename T, index_t VectorSize>
115
116
__device__ void amd_buffer_store(const T* p_src_thread,
                                 T* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
117
                                 index_t dst_thread_data_offset,
118
119
                                 bool dst_thread_data_valid,
                                 index_t dst_data_range);
120

121
122
123
124
// buffer_atomic requires:
//   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
//   2) p_dst_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
125
template <typename T, index_t VectorSize>
126
127
__device__ void amd_buffer_atomic_add(const T* p_src_thread,
                                      T* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
128
                                      index_t dst_thread_data_offset,
129
130
                                      bool dst_thread_data_valid,
                                      index_t dst_data_range);
131

132
template <>
133
__device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
134
                                           index_t src_thread_data_offset,
135
136
                                           bool src_thread_data_valid,
                                           index_t src_data_range)
137
{
Chao Liu's avatar
Chao Liu committed
138
    BufferResource<float> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
139

140
141
142
143
144
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
145
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
146
147

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
148

Chao Liu's avatar
Chao Liu committed
149
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
150
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
151

Chao Liu's avatar
Chao Liu committed
152
    return __llvm_amdgcn_buffer_load_f32(
153
154
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
Chao Liu's avatar
Chao Liu committed
155
156
157
158
    float tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? tmp : float(0);
159
#endif
Chao Liu's avatar
Chao Liu committed
160
161
162
}

template <>
163
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
164
                                              index_t src_thread_data_offset,
165
166
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
167
{
Chao Liu's avatar
Chao Liu committed
168
    BufferResource<float> src_wave_buffer_resource;
169

170
171
172
173
174
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
175
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
176

Chao Liu's avatar
Chao Liu committed
177
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
178

Chao Liu's avatar
Chao Liu committed
179
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
180
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
181

Chao Liu's avatar
Chao Liu committed
182
    return __llvm_amdgcn_buffer_load_f32x2(
183
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
184
185
186
187
188
#else
    float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? tmp : float2_t(0);
189
#endif
190
191
192
}

template <>
193
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
194
                                              index_t src_thread_data_offset,
195
196
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
197
{
Chao Liu's avatar
Chao Liu committed
198
    BufferResource<float> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
199

200
201
202
203
204
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
205
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
206
207

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
208

Chao Liu's avatar
Chao Liu committed
209
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
210
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
211

Chao Liu's avatar
Chao Liu committed
212
    return __llvm_amdgcn_buffer_load_f32x4(
213
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
214
215
216
217
218
#else
    float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? tmp : float4_t(0);
219
#endif
Chao Liu's avatar
Chao Liu committed
220
221
222
}

template <>
223
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
224
                                             index_t src_thread_data_offset,
225
226
                                             bool src_thread_data_valid,
                                             index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
227
{
Chao Liu's avatar
Chao Liu committed
228
    BufferResource<half_t> src_wave_buffer_resource;
229

230
231
232
233
234
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
235
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
236

Chao Liu's avatar
Chao Liu committed
237
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
238

Chao Liu's avatar
Chao Liu committed
239
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
240
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
241

Chao Liu's avatar
Chao Liu committed
242
243
244
245
    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return __llvm_amdgcn_raw_buffer_load_f16(
        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
246
#else
Chao Liu's avatar
Chao Liu committed
247
248
249
250
251
252
253
    half_t zero(0);

    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_f16(
                                       src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
                                 : zero;
254
#endif
Chao Liu's avatar
Chao Liu committed
255
}
256

Chao Liu's avatar
Chao Liu committed
257
template <>
258
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
259
                                              index_t src_thread_data_offset,
260
261
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
262
{
Chao Liu's avatar
Chao Liu committed
263
    BufferResource<half_t> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
264

265
266
267
268
269
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
270
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
271
272
273

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

Chao Liu's avatar
Chao Liu committed
274
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
275
276
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
277
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
278
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
279
280

    return *reinterpret_cast<half2_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
281
282
283
284
285
286
287
288
#else
    half2_t zeros(0);

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<half2_t*>(&dst_out_tmp) : zeros;
#endif
289
290
291
}

template <>
292
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
293
                                              index_t src_thread_data_offset,
294
295
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
296
{
Chao Liu's avatar
Chao Liu committed
297
    BufferResource<half_t> src_wave_buffer_resource;
298

299
300
301
302
303
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
304
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
305
306

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
307

Chao Liu's avatar
Chao Liu committed
308
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
309
310
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
311
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
312
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
313
314

    return *reinterpret_cast<half4_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
315
316
317
318
319
320
321
322
#else
    half4_t zeros(0);

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<half4_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
323
324
325
}

template <>
326
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
327
                                              index_t src_thread_data_offset,
328
329
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
330
{
Chao Liu's avatar
Chao Liu committed
331
    BufferResource<half_t> src_wave_buffer_resource;
332

333
334
335
336
337
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
338
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
339
340
341

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

Chao Liu's avatar
Chao Liu committed
342
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
343
344
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
345
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
346
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
347
348

    return *reinterpret_cast<half8_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
349
350
351
352
353
354
355
356
#else
    half8_t zeros(0);

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<half8_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
357
358
359
}

template <>
360
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
361
                                             index_t src_thread_data_offset,
362
363
                                             bool src_thread_data_valid,
                                             index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
364
{
Chao Liu's avatar
Chao Liu committed
365
    BufferResource<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
366

367
368
369
370
371
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
372
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
373

Chao Liu's avatar
Chao Liu committed
374
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
375

Chao Liu's avatar
Chao Liu committed
376
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
377
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
378

Chao Liu's avatar
Chao Liu committed
379
380
381
382
    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return __llvm_amdgcn_raw_buffer_load_bf16(
        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
383
#else
Chao Liu's avatar
Chao Liu committed
384
385
386
387
388
389
390
    ushort zero(0);

    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_bf16(
                                       src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
                                 : zero;
391
#endif
Chao Liu's avatar
Chao Liu committed
392
393
394
}

template <>
395
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
396
                                                index_t src_thread_data_offset,
397
398
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
399
{
Chao Liu's avatar
Chao Liu committed
400
    BufferResource<ushort> src_wave_buffer_resource;
401

402
403
404
405
406
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
407
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
408
409
410

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
411
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
412
413
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
414
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
415
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
416
417

    return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
418
419
420
421
422
423
424
425
#else
    ushort2_t zeros(0);

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<ushort2_t*>(&dst_out_tmp) : zeros;
#endif
426
427
428
}

template <>
429
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
430
                                                index_t src_thread_data_offset,
431
432
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
433
{
Chao Liu's avatar
Chao Liu committed
434
    BufferResource<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
435

436
437
438
439
440
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
441
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
442
443
444

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
445
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
446
447
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
448
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
449
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
450
451

    return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
452
453
454
455
456
457
458
459
#else
    ushort4_t zeros(0);

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<ushort4_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
460
461
462
}

template <>
463
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
464
                                                index_t src_thread_data_offset,
465
466
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
467
{
Chao Liu's avatar
Chao Liu committed
468
    BufferResource<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
469

470
471
472
473
474
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
475
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
476
477
478

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
479
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
480
481
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
482
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
483
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
484
485

    return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
486
487
488
489
490
491
492
493
#else
    ushort8_t zeros(0);

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<ushort8_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
494
495
496
}

template <>
497
498
__device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
499
                                           index_t dst_thread_data_offset,
500
501
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
502
{
Chao Liu's avatar
Chao Liu committed
503
    BufferResource<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
504

505
506
507
508
509
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
510
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
511

512
513
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
514
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
515
516
517
518
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(*p_src_thread,
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
519
                                   0,
520
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
521
522
                                   false,
                                   false);
523
524
525
526
527
528
529
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(
            *p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
#endif
Chao Liu's avatar
Chao Liu committed
530
531
532
}

template <>
533
534
__device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
535
                                           index_t dst_thread_data_offset,
536
537
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
538
{
Chao Liu's avatar
Chao Liu committed
539
    BufferResource<float> dst_wave_buffer_resource;
540

541
542
543
544
545
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
546
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
547

Chao Liu's avatar
Chao Liu committed
548
549
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
550
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
551
552
553
554
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
555
                                     0,
556
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
557
558
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
559
560
561
562
563
564
565
566
567
568
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
                                         dst_wave_buffer_resource.data,
                                         0,
                                         dst_thread_addr_offset,
                                         false,
                                         false);
    }
569
#endif
570
571
572
}

template <>
573
574
__device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
575
                                           index_t dst_thread_data_offset,
576
577
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
578
{
Chao Liu's avatar
Chao Liu committed
579
    BufferResource<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
580

581
582
583
584
585
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
586
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
587

588
589
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
590
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
591
592
593
594
595
596
597
598
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
599
600
601
602
603
604
605
606
607
608
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
                                         dst_wave_buffer_resource.data,
                                         0,
                                         dst_thread_addr_offset,
                                         false,
                                         false);
    }
609
#endif
Chao Liu's avatar
Chao Liu committed
610
611
612
}

template <>
613
614
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
615
                                            index_t dst_thread_data_offset,
616
617
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
618
{
Chao Liu's avatar
Chao Liu committed
619
    BufferResource<half_t> dst_wave_buffer_resource;
620

621
622
623
624
625
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
626
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
627

Chao Liu's avatar
Chao Liu committed
628
629
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);

Chao Liu's avatar
Chao Liu committed
630
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
631
632
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
633
634
635
636
637
638
639
    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    __llvm_amdgcn_raw_buffer_store_f16(*p_src_thread,
                                       dst_wave_buffer_resource.data,
                                       dst_addr_shift + dst_thread_addr_offset,
                                       0,
                                       0);
640
641
642
#else
    if(dst_thread_data_valid)
    {
Chao Liu's avatar
Chao Liu committed
643
644
645
646
        // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
        // everything is passed to Voffset
        __llvm_amdgcn_raw_buffer_store_f16(
            *p_src_thread, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
647
    }
648
649
650
651
#endif
}

template <>
652
653
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
654
                                            index_t dst_thread_data_offset,
655
656
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
657
{
Chao Liu's avatar
Chao Liu committed
658
    BufferResource<half_t> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
659

660
661
662
663
664
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
665
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
666
667

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
668

669
670
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
671
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
672
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
673
674

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
675
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
676
                                   0,
677
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
678
679
                                   false,
                                   false);
Chao Liu's avatar
Chao Liu committed
680
681
682
683
684
685
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
Chao Liu's avatar
Chao Liu committed
686
687
688
689
#endif
}

template <>
690
691
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
692
                                            index_t dst_thread_data_offset,
693
694
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
695
{
Chao Liu's avatar
Chao Liu committed
696
    BufferResource<half_t> dst_wave_buffer_resource;
697
698
699
700
701
702

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
703
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
704

705
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
706

707
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
708

Chao Liu's avatar
Chao Liu committed
709
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
710
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
711
712

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
713
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
714
                                     0,
715
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
716
717
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
718
719
720
721
722
723
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x2(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
724
725
726
#endif
}

727
template <>
728
729
__device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
730
                                            index_t dst_thread_data_offset,
731
732
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
733
{
Chao Liu's avatar
Chao Liu committed
734
    BufferResource<half_t> dst_wave_buffer_resource;
735
736
737
738
739
740

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
741
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
742
743

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
Chao Liu's avatar
Chao Liu committed
744

745
746
    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
747
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
748
749
750
751
752
753
754
755
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
756
757
758
759
760
761
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x4(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
762
763
764
765
766
767
768
769
770
771
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
{
Chao Liu's avatar
Chao Liu committed
772
    BufferResource<ushort> dst_wave_buffer_resource;
773
774
775
776
777
778

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
779
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
780
781
782

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
783
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
784
785
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
786
787
788
789
790
    __llvm_amdgcn_raw_buffer_store_bf16(*p_src_thread,
                                        dst_wave_buffer_resource.data,
                                        dst_addr_shift + dst_thread_addr_offset,
                                        0,
                                        0);
Chao Liu's avatar
Chao Liu committed
791
#else
792
793
    if(dst_thread_data_valid)
    {
Chao Liu's avatar
Chao Liu committed
794
795
        __llvm_amdgcn_raw_buffer_store_bf16(
            *p_src_thread, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
796
    }
Chao Liu's avatar
Chao Liu committed
797
798
799
800
#endif
}

template <>
801
802
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
803
                                            index_t dst_thread_data_offset,
804
805
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
806
{
Chao Liu's avatar
Chao Liu committed
807
    BufferResource<ushort> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
808

809
810
811
812
813
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
814
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
815

Chao Liu's avatar
Chao Liu committed
816
817
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

818
819
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
820
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
821
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
822
823

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
824
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
825
                                   0,
826
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
827
828
                                   false,
                                   false);
Chao Liu's avatar
Chao Liu committed
829
830
831
832
833
834
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
Chao Liu's avatar
Chao Liu committed
835
836
837
838
#endif
}

template <>
839
840
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
841
                                            index_t dst_thread_data_offset,
842
843
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
844
{
Chao Liu's avatar
Chao Liu committed
845
    BufferResource<ushort> dst_wave_buffer_resource;
846

847
848
849
850
851
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
852
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
853

Chao Liu's avatar
Chao Liu committed
854
855
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

856
857
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
858
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
859
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
860
861

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
862
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
863
                                     0,
864
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
865
866
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
867
868
869
870
871
872
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x2(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
873
874
875
#endif
}

Chao Liu's avatar
Chao Liu committed
876
template <>
877
878
879
880
881
882
__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
{
Chao Liu's avatar
Chao Liu committed
883
    BufferResource<ushort> dst_wave_buffer_resource;
884
885
886
887
888
889

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
890
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
891
892
893
894
895

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
896
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
897
898
899
900
901
902
903
904
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
905
906
907
908
909
910
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x4(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
911
912
913
#endif
}

Chao Liu's avatar
Chao Liu committed
914
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
915
916
917
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
918
                                                index_t dst_thread_data_offset,
919
920
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
921
{
Chao Liu's avatar
Chao Liu committed
922
    BufferResource<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
923

924
925
926
927
928
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
929
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
930
931
932

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
933
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
934
935
936
937
938
939
940
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
                                        dst_wave_buffer_resource.data,
                                        0,
                                        dst_addr_shift + dst_thread_addr_offset,
                                        false);
Chao Liu's avatar
Chao Liu committed
941
942
943
944
945
946
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(
            *p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false);
    }
947
#endif
Chao Liu's avatar
Chao Liu committed
948
949
950
}

template <>
951
952
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
953
                                                index_t dst_thread_data_offset,
954
955
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
956
{
Chao Liu's avatar
Chao Liu committed
957
    BufferResource<float> dst_wave_buffer_resource;
958
959
960
961
962
963

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range;
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
964
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
965
966
967

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
968
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
969
970
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
971
972
    for(index_t i = 0; i < 2; ++i)
    {
973
974
975
976
977
978
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_buffer_resource.data,
                                            0,
                                            dst_addr_shift + dst_thread_addr_offset +
                                                i * sizeof(float),
                                            false);
Chao Liu's avatar
Chao Liu committed
979
    }
Chao Liu's avatar
Chao Liu committed
980
981
982
983
984
985
986
987
988
989
990
991
#else
    if(dst_thread_data_valid)
    {
        for(index_t i = 0; i < 2; ++i)
        {
            __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                                dst_wave_buffer_resource.data,
                                                0,
                                                dst_thread_addr_offset + i * sizeof(float),
                                                false);
        }
    }
992
#endif
Chao Liu's avatar
Chao Liu committed
993
994
995
}

template <>
996
997
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
998
                                                index_t dst_thread_data_offset,
999
1000
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
1001
{
Chao Liu's avatar
Chao Liu committed
1002
    BufferResource<float> dst_wave_buffer_resource;
1003
1004
1005
1006
1007
1008

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
1009
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
1010
1011
1012

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
1013
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    for(index_t i = 0; i < 4; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_buffer_resource.data,
                                            0,
                                            dst_addr_shift + dst_thread_addr_offset +
                                                i * sizeof(float),
                                            false);
    }
Chao Liu's avatar
Chao Liu committed
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
#else
    if(dst_thread_data_valid)
    {
        for(index_t i = 0; i < 4; ++i)
        {
            __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                                dst_wave_buffer_resource.data,
                                                0,
                                                dst_thread_addr_offset + i * sizeof(float),
                                                false);
        }
    }
1037
#endif
Chao Liu's avatar
Chao Liu committed
1038
}
Chao Liu's avatar
Chao Liu committed
1039
#endif // CK_USE_AMD_BUFFER_ATOMIC_FADD
Chao Liu's avatar
Chao Liu committed
1040

1041
1042
} // namespace ck
#endif