"router/vscode:/vscode.git/clone" did not exist on "f848decee615ee10b78510b62036021a075dbf7b"
amd_buffer_addressing.hpp 42 KB
Newer Older
1
2
3
4
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP

#include "float_type.hpp"
Chao Liu's avatar
Chao Liu committed
5
#include "amd_buffer_addressing_v2.hpp"
6
7
8

namespace ck {

9
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
10
11
12
13
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.load.f32");
14

Chao Liu's avatar
Chao Liu committed
15
__device__ float2_t
16
__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
17
18
19
20
21
22
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");

__device__ float4_t
23
__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
24
25
26
27
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
__device__ half_t
__llvm_amdgcn_raw_buffer_load_f16(int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");

__device__ ushort
__llvm_amdgcn_raw_buffer_load_bf16(int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.bf16");
Chao Liu's avatar
Chao Liu committed
39
40

__device__ void __llvm_amdgcn_buffer_store_f32(float vdata,
41
                                               int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
42
43
44
45
46
47
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.store.f32");

__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata,
48
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
49
50
51
52
53
54
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");

__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata,
55
                                                 int32x4_t srsrc,
Chao Liu's avatar
Chao Liu committed
56
57
58
59
60
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");

61
__device__ void
Chao Liu's avatar
Chao Liu committed
62
63
64
65
66
__llvm_amdgcn_raw_buffer_store_f16(half_t vdata,
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
Chao Liu's avatar
Chao Liu committed
67
68

__device__ void
Chao Liu's avatar
Chao Liu committed
69
70
71
72
73
74
75
76
77
78
79
__llvm_amdgcn_raw_buffer_store_bf16(ushort vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.bf16");

#if CK_USE_AMD_BUFFER_ATOMIC_FADD
#if CK_HIP_VERSION_FLAT >= 3010020405
// starting ROCm-3.10, the return type becomes float
__device__ float
#else
Chao Liu's avatar
Chao Liu committed
80
__device__ void
Chao Liu's avatar
Chao Liu committed
81
#endif
Chao Liu's avatar
Chao Liu committed
82
__llvm_amdgcn_buffer_atomic_add_f32(float vdata,
Chao Liu's avatar
Chao Liu committed
83
                                    int32x4_t rsrc,
Chao Liu's avatar
Chao Liu committed
84
85
86
                                    index_t vindex,
                                    index_t offset,
                                    bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");
Chao Liu's avatar
Chao Liu committed
87
#endif
88

Chao Liu's avatar
Chao Liu committed
89
// buffer_load requires:
Chao Liu's avatar
Chao Liu committed
90
91
//   1) p_src_wave must be in global memory space
//   2) p_src_wave to be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
92
// It is user's responsibility to make sure that is true.
93
template <typename T, index_t VectorSize>
Chao Liu's avatar
Chao Liu committed
94
95
96
97
__device__ typename vector_type<T, VectorSize>::type amd_buffer_load(const T* p_src_wave,
                                                                     index_t src_thread_data_offset,
                                                                     bool src_thread_data_valid,
                                                                     index_t src_elemenst_space);
98

Chao Liu's avatar
Chao Liu committed
99
// buffer_store requires:
100
101
//   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
//   2) p_dst_thread to be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
102
// It is user's responsibility to make sure that is true.
103
template <typename T, index_t VectorSize>
104
105
__device__ void amd_buffer_store(const T* p_src_thread,
                                 T* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
106
                                 index_t dst_thread_data_offset,
107
108
                                 bool dst_thread_data_valid,
                                 index_t dst_data_range);
109

110
111
112
113
// buffer_atomic requires:
//   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
//   2) p_dst_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
114
template <typename T, index_t VectorSize>
115
116
__device__ void amd_buffer_atomic_add(const T* p_src_thread,
                                      T* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
117
                                      index_t dst_thread_data_offset,
118
119
                                      bool dst_thread_data_valid,
                                      index_t dst_data_range);
120

121
template <>
122
__device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
123
                                           index_t src_thread_data_offset,
124
125
                                           bool src_thread_data_valid,
                                           index_t src_data_range)
126
{
Chao Liu's avatar
Chao Liu committed
127
    BufferResource<float> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
128

129
130
131
132
133
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
134
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
135
136

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
137

Chao Liu's avatar
Chao Liu committed
138
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
139
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
140

Chao Liu's avatar
Chao Liu committed
141
    return __llvm_amdgcn_buffer_load_f32(
142
143
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
Chao Liu's avatar
Chao Liu committed
144
145
146
147
    float tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? tmp : float(0);
148
#endif
Chao Liu's avatar
Chao Liu committed
149
150
151
}

template <>
152
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
153
                                              index_t src_thread_data_offset,
154
155
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
156
{
Chao Liu's avatar
Chao Liu committed
157
    BufferResource<float> src_wave_buffer_resource;
158

159
160
161
162
163
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
164
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
165

Chao Liu's avatar
Chao Liu committed
166
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
167

Chao Liu's avatar
Chao Liu committed
168
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
169
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
170

Chao Liu's avatar
Chao Liu committed
171
    return __llvm_amdgcn_buffer_load_f32x2(
172
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
173
174
175
176
177
#else
    float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? tmp : float2_t(0);
178
#endif
179
180
181
}

template <>
182
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
Chao Liu's avatar
Chao Liu committed
183
                                              index_t src_thread_data_offset,
184
185
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
186
{
Chao Liu's avatar
Chao Liu committed
187
    BufferResource<float> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
188

189
190
191
192
193
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
194
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
195
196

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
197

Chao Liu's avatar
Chao Liu committed
198
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
199
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
200

Chao Liu's avatar
Chao Liu committed
201
    return __llvm_amdgcn_buffer_load_f32x4(
202
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
203
204
205
206
207
#else
    float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? tmp : float4_t(0);
208
#endif
Chao Liu's avatar
Chao Liu committed
209
210
211
}

template <>
212
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
213
                                             index_t src_thread_data_offset,
214
215
                                             bool src_thread_data_valid,
                                             index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
216
{
Chao Liu's avatar
Chao Liu committed
217
    BufferResource<half_t> src_wave_buffer_resource;
218

219
220
221
222
223
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
224
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
225

Chao Liu's avatar
Chao Liu committed
226
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
227

Chao Liu's avatar
Chao Liu committed
228
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
229
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
230

Chao Liu's avatar
Chao Liu committed
231
232
233
234
    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return __llvm_amdgcn_raw_buffer_load_f16(
        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
235
#else
Chao Liu's avatar
Chao Liu committed
236
237
238
239
240
241
242
    half_t zero(0);

    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_f16(
                                       src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
                                 : zero;
243
#endif
Chao Liu's avatar
Chao Liu committed
244
}
245

Chao Liu's avatar
Chao Liu committed
246
template <>
247
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
248
                                              index_t src_thread_data_offset,
249
250
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
251
{
Chao Liu's avatar
Chao Liu committed
252
    BufferResource<half_t> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
253

254
255
256
257
258
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
259
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
260
261
262

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

Chao Liu's avatar
Chao Liu committed
263
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
264
265
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
266
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
267
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
268
269

    return *reinterpret_cast<half2_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
270
271
272
273
274
275
276
277
#else
    half2_t zeros(0);

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<half2_t*>(&dst_out_tmp) : zeros;
#endif
278
279
280
}

template <>
281
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
282
                                              index_t src_thread_data_offset,
283
284
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
285
{
Chao Liu's avatar
Chao Liu committed
286
    BufferResource<half_t> src_wave_buffer_resource;
287

288
289
290
291
292
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
293
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
294
295

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
296

Chao Liu's avatar
Chao Liu committed
297
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
298
299
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
300
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
301
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
302
303

    return *reinterpret_cast<half4_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
304
305
306
307
308
309
310
311
#else
    half4_t zeros(0);

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<half4_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
312
313
314
}

template <>
315
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
Chao Liu's avatar
Chao Liu committed
316
                                              index_t src_thread_data_offset,
317
318
                                              bool src_thread_data_valid,
                                              index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
319
{
Chao Liu's avatar
Chao Liu committed
320
    BufferResource<half_t> src_wave_buffer_resource;
321

322
323
324
325
326
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
327
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
328
329
330

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

Chao Liu's avatar
Chao Liu committed
331
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
332
333
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
334
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
335
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
336
337

    return *reinterpret_cast<half8_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
338
339
340
341
342
343
344
345
#else
    half8_t zeros(0);

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<half8_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
346
347
348
}

template <>
349
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
350
                                             index_t src_thread_data_offset,
351
352
                                             bool src_thread_data_valid,
                                             index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
353
{
Chao Liu's avatar
Chao Liu committed
354
    BufferResource<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
355

356
357
358
359
360
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
361
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
362

Chao Liu's avatar
Chao Liu committed
363
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
364

Chao Liu's avatar
Chao Liu committed
365
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
366
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
367

Chao Liu's avatar
Chao Liu committed
368
369
370
371
    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return __llvm_amdgcn_raw_buffer_load_bf16(
        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
372
#else
Chao Liu's avatar
Chao Liu committed
373
374
375
376
377
378
379
    ushort zero(0);

    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_bf16(
                                       src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
                                 : zero;
380
#endif
Chao Liu's avatar
Chao Liu committed
381
382
383
}

template <>
384
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
385
                                                index_t src_thread_data_offset,
386
387
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
388
{
Chao Liu's avatar
Chao Liu committed
389
    BufferResource<ushort> src_wave_buffer_resource;
390

391
392
393
394
395
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
396
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
397
398
399

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
400
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
401
402
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
403
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
404
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
405
406

    return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
407
408
409
410
411
412
413
414
#else
    ushort2_t zeros(0);

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<ushort2_t*>(&dst_out_tmp) : zeros;
#endif
415
416
417
}

template <>
418
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
419
                                                index_t src_thread_data_offset,
420
421
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
422
{
Chao Liu's avatar
Chao Liu committed
423
    BufferResource<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
424

425
426
427
428
429
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
430
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
431
432
433

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
434
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
435
436
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
437
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
438
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
439
440

    return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
441
442
443
444
445
446
447
448
#else
    ushort4_t zeros(0);

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<ushort4_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
449
450
451
}

template <>
452
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
Chao Liu's avatar
Chao Liu committed
453
                                                index_t src_thread_data_offset,
454
455
                                                bool src_thread_data_valid,
                                                index_t src_data_range)
Chao Liu's avatar
Chao Liu committed
456
{
Chao Liu's avatar
Chao Liu committed
457
    BufferResource<ushort> src_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
458

459
460
461
462
463
    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
464
    src_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
465
466
467

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
468
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
469
470
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
471
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
472
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
Chao Liu's avatar
Chao Liu committed
473
474

    return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
Chao Liu's avatar
Chao Liu committed
475
476
477
478
479
480
481
482
#else
    ushort8_t zeros(0);

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? *reinterpret_cast<ushort8_t*>(&dst_out_tmp) : zeros;
#endif
Chao Liu's avatar
Chao Liu committed
483
484
485
}

template <>
486
487
__device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
488
                                           index_t dst_thread_data_offset,
489
490
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
491
{
Chao Liu's avatar
Chao Liu committed
492
    BufferResource<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
493

494
495
496
497
498
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
499
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
500

501
502
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
503
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
504
505
506
507
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(*p_src_thread,
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
508
                                   0,
509
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
510
511
                                   false,
                                   false);
512
513
514
515
516
517
518
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(
            *p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
#endif
Chao Liu's avatar
Chao Liu committed
519
520
521
}

template <>
522
523
__device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
524
                                           index_t dst_thread_data_offset,
525
526
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
527
{
Chao Liu's avatar
Chao Liu committed
528
    BufferResource<float> dst_wave_buffer_resource;
529

530
531
532
533
534
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
535
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
536

Chao Liu's avatar
Chao Liu committed
537
538
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
539
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
540
541
542
543
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
544
                                     0,
545
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
546
547
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
548
549
550
551
552
553
554
555
556
557
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
                                         dst_wave_buffer_resource.data,
                                         0,
                                         dst_thread_addr_offset,
                                         false,
                                         false);
    }
558
#endif
559
560
561
}

template <>
562
563
__device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
                                           float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
564
                                           index_t dst_thread_data_offset,
565
566
                                           bool dst_thread_data_valid,
                                           index_t dst_data_range)
567
{
Chao Liu's avatar
Chao Liu committed
568
    BufferResource<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
569

570
571
572
573
574
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
575
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
576

577
578
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
579
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
580
581
582
583
584
585
586
587
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
588
589
590
591
592
593
594
595
596
597
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
                                         dst_wave_buffer_resource.data,
                                         0,
                                         dst_thread_addr_offset,
                                         false,
                                         false);
    }
598
#endif
Chao Liu's avatar
Chao Liu committed
599
600
601
}

template <>
602
603
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
604
                                            index_t dst_thread_data_offset,
605
606
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
607
{
Chao Liu's avatar
Chao Liu committed
608
    BufferResource<half_t> dst_wave_buffer_resource;
609

610
611
612
613
614
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
615
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
616

Chao Liu's avatar
Chao Liu committed
617
618
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);

Chao Liu's avatar
Chao Liu committed
619
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
620
621
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
622
623
624
625
626
627
628
    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    __llvm_amdgcn_raw_buffer_store_f16(*p_src_thread,
                                       dst_wave_buffer_resource.data,
                                       dst_addr_shift + dst_thread_addr_offset,
                                       0,
                                       0);
629
630
631
#else
    if(dst_thread_data_valid)
    {
Chao Liu's avatar
Chao Liu committed
632
633
634
635
        // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
        // everything is passed to Voffset
        __llvm_amdgcn_raw_buffer_store_f16(
            *p_src_thread, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
636
    }
637
638
639
640
#endif
}

template <>
641
642
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
643
                                            index_t dst_thread_data_offset,
644
645
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
646
{
Chao Liu's avatar
Chao Liu committed
647
    BufferResource<half_t> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
648

649
650
651
652
653
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
654
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
655
656

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
657

658
659
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
660
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
661
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
662
663

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
664
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
665
                                   0,
666
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
667
668
                                   false,
                                   false);
Chao Liu's avatar
Chao Liu committed
669
670
671
672
673
674
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
Chao Liu's avatar
Chao Liu committed
675
676
677
678
#endif
}

template <>
679
680
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
681
                                            index_t dst_thread_data_offset,
682
683
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
684
{
Chao Liu's avatar
Chao Liu committed
685
    BufferResource<half_t> dst_wave_buffer_resource;
686
687
688
689
690
691

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
692
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
693

694
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
695

696
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
697

Chao Liu's avatar
Chao Liu committed
698
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
699
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
700
701

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
702
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
703
                                     0,
704
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
705
706
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
707
708
709
710
711
712
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x2(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
713
714
715
#endif
}

716
template <>
717
718
__device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
719
                                            index_t dst_thread_data_offset,
720
721
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
722
{
Chao Liu's avatar
Chao Liu committed
723
    BufferResource<half_t> dst_wave_buffer_resource;
724
725
726
727
728
729

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
730
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
731
732

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
Chao Liu's avatar
Chao Liu committed
733

734
735
    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
736
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
737
738
739
740
741
742
743
744
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
745
746
747
748
749
750
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x4(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
751
752
753
754
755
756
757
758
759
760
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
{
Chao Liu's avatar
Chao Liu committed
761
    BufferResource<ushort> dst_wave_buffer_resource;
762
763
764
765
766
767

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
768
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
769
770
771

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

Chao Liu's avatar
Chao Liu committed
772
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
773
774
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
775
776
777
778
779
    __llvm_amdgcn_raw_buffer_store_bf16(*p_src_thread,
                                        dst_wave_buffer_resource.data,
                                        dst_addr_shift + dst_thread_addr_offset,
                                        0,
                                        0);
Chao Liu's avatar
Chao Liu committed
780
#else
781
782
    if(dst_thread_data_valid)
    {
Chao Liu's avatar
Chao Liu committed
783
784
        __llvm_amdgcn_raw_buffer_store_bf16(
            *p_src_thread, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
785
    }
Chao Liu's avatar
Chao Liu committed
786
787
788
789
#endif
}

template <>
790
791
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
792
                                            index_t dst_thread_data_offset,
793
794
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
795
{
Chao Liu's avatar
Chao Liu committed
796
    BufferResource<ushort> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
797

798
799
800
801
802
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
803
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
804

Chao Liu's avatar
Chao Liu committed
805
806
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

807
808
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
809
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
810
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
811
812

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
813
                                   dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
814
                                   0,
815
                                   dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
816
817
                                   false,
                                   false);
Chao Liu's avatar
Chao Liu committed
818
819
820
821
822
823
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
Chao Liu's avatar
Chao Liu committed
824
825
826
827
#endif
}

template <>
828
829
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
830
                                            index_t dst_thread_data_offset,
831
832
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
833
{
Chao Liu's avatar
Chao Liu committed
834
    BufferResource<ushort> dst_wave_buffer_resource;
835

836
837
838
839
840
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
841
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
842

Chao Liu's avatar
Chao Liu committed
843
844
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

845
846
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
847
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
848
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
Chao Liu's avatar
Chao Liu committed
849
850

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
851
                                     dst_wave_buffer_resource.data,
Chao Liu's avatar
Chao Liu committed
852
                                     0,
853
                                     dst_addr_shift + dst_thread_addr_offset,
Chao Liu's avatar
Chao Liu committed
854
855
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
856
857
858
859
860
861
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x2(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
862
863
864
#endif
}

Chao Liu's avatar
Chao Liu committed
865
template <>
866
867
868
869
870
871
__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
{
Chao Liu's avatar
Chao Liu committed
872
    BufferResource<ushort> dst_wave_buffer_resource;
873
874
875
876
877
878

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
879
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
880
881
882
883
884

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

Chao Liu's avatar
Chao Liu committed
885
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
886
887
888
889
890
891
892
893
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
Chao Liu's avatar
Chao Liu committed
894
895
896
897
898
899
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x4(
            *p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
    }
900
901
902
#endif
}

Chao Liu's avatar
Chao Liu committed
903
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
904
905
906
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
907
                                                index_t dst_thread_data_offset,
908
909
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
910
{
Chao Liu's avatar
Chao Liu committed
911
    BufferResource<float> dst_wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
912

913
914
915
916
917
    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
918
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
919
920
921

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
922
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
923
924
925
926
927
928
929
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
                                        dst_wave_buffer_resource.data,
                                        0,
                                        dst_addr_shift + dst_thread_addr_offset,
                                        false);
Chao Liu's avatar
Chao Liu committed
930
931
932
933
934
935
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(
            *p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false);
    }
936
#endif
Chao Liu's avatar
Chao Liu committed
937
938
939
}

template <>
940
941
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
942
                                                index_t dst_thread_data_offset,
943
944
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
945
{
Chao Liu's avatar
Chao Liu committed
946
    BufferResource<float> dst_wave_buffer_resource;
947
948
949
950
951
952

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range;
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
953
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
954
955
956

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
957
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
958
959
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

Chao Liu's avatar
Chao Liu committed
960
961
    for(index_t i = 0; i < 2; ++i)
    {
962
963
964
965
966
967
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_buffer_resource.data,
                                            0,
                                            dst_addr_shift + dst_thread_addr_offset +
                                                i * sizeof(float),
                                            false);
Chao Liu's avatar
Chao Liu committed
968
    }
Chao Liu's avatar
Chao Liu committed
969
970
971
972
973
974
975
976
977
978
979
980
#else
    if(dst_thread_data_valid)
    {
        for(index_t i = 0; i < 2; ++i)
        {
            __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                                dst_wave_buffer_resource.data,
                                                0,
                                                dst_thread_addr_offset + i * sizeof(float),
                                                false);
        }
    }
981
#endif
Chao Liu's avatar
Chao Liu committed
982
983
984
}

template <>
985
986
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
                                                float* p_dst_wave,
Chao Liu's avatar
Chao Liu committed
987
                                                index_t dst_thread_data_offset,
988
989
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
Chao Liu's avatar
Chao Liu committed
990
{
Chao Liu's avatar
Chao Liu committed
991
    BufferResource<float> dst_wave_buffer_resource;
992
993
994
995
996
997

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
Chao Liu's avatar
Chao Liu committed
998
    dst_wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
999
1000
1001

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

Chao Liu's avatar
Chao Liu committed
1002
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    for(index_t i = 0; i < 4; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_buffer_resource.data,
                                            0,
                                            dst_addr_shift + dst_thread_addr_offset +
                                                i * sizeof(float),
                                            false);
    }
Chao Liu's avatar
Chao Liu committed
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
#else
    if(dst_thread_data_valid)
    {
        for(index_t i = 0; i < 4; ++i)
        {
            __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                                dst_wave_buffer_resource.data,
                                                0,
                                                dst_thread_addr_offset + i * sizeof(float),
                                                false);
        }
    }
1026
#endif
Chao Liu's avatar
Chao Liu committed
1027
}
Chao Liu's avatar
Chao Liu committed
1028
#endif // CK_USE_AMD_BUFFER_ATOMIC_FADD
Chao Liu's avatar
Chao Liu committed
1029

1030
1031
} // namespace ck
#endif