"vscode:/vscode.git/clone" did not exist on "3928d48174d928d5fdd1599d262e200480699adb"
amd_inline_asm.hpp 26 KB
Newer Older
1
2
3
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP

Chao Liu's avatar
Chao Liu committed
4
#include "vector_type.hpp"
Jing Zhang's avatar
Jing Zhang committed
5

6
7
namespace ck {

Chao Liu's avatar
Chao Liu committed
8
// cast a pointer of LDS to its address
Chao Liu's avatar
Chao Liu committed
9
extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
Chao Liu's avatar
Chao Liu committed
10

11
// global_load and global_store
12
13
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
14
__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
15
16

template <typename T, index_t VectorSize>
17
18
19
20
__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                               T* p_dst_block,
                               uint32_t dst_thread_offset,
                               uint32_t dst_const_offset);
21
22

template <>
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
__device__ float __global_load<float, 1>(const float* p_src_block,
                                         uint32_t src_thread_offset,
                                         uint32_t src_const_offset)
{
#if 0 // compute on VALU
    float dst;

    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);

    asm volatile("\n \
     global_load_dword %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_offset_u64), "s"(p_src_block));

    return dst;
#else // compute on SALU
    float dst;

    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);

    const float* p_src_block_with_offset = p_src_block + src_const_offset;

    asm volatile("\n \
     global_load_dword %0, %1, %2, offset:0 \n \
     ;;s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));

    return dst;
#endif
}

template <>
__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(const float* p_src_block,
                                                                     uint32_t src_thread_offset,
                                                                     uint32_t src_const_offset)
{
#if 0 // compute on VALU
    vector_type<float, 2>::MemoryType dst;

    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);

    asm volatile("\n \
     global_load_dwordx2 %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_offset_u64), "s"(p_src_block));

    return dst;
#else // compute on SALU
    vector_type<float, 2>::MemoryType dst;

    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);

    const float* p_src_block_with_offset = p_src_block + src_const_offset;

    asm volatile("\n \
     global_load_dwordx2 %0, %1, %2, offset:0 \n \
     ;;s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));

    return dst;
#endif
}

template <>
__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(const float* p_src_block,
                                                                     uint32_t src_thread_offset,
                                                                     uint32_t src_const_offset)
{
#if 0 // compute on VALU
    vector_type<float, 4>::MemoryType dst;

    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);

    asm volatile("\n \
     global_load_dwordx4 %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_offset_u64), "s"(p_src_block));

    return dst;
#else // compute on SALU
    vector_type<float, 4>::MemoryType dst;

    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);

    const float* p_src_block_with_offset = p_src_block + src_const_offset;

    asm volatile("\n \
     global_load_dwordx4 %0, %1, %2, offset:0 \n \
     ;;s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));

    return dst;
#endif
}

template <>
__device__ void __global_store<float, 1>(const float& src,
                                         float* p_dst_block,
                                         uint32_t dst_thread_offset,
                                         uint32_t dst_const_offset)
{
#if 0 // compute on VALU
    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);

    asm volatile("\n \
     global_store_dword %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                 :
                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block));
#else // compute on SALU
    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);

    float* p_dst_block_with_offset = p_dst_block + dst_const_offset;

    asm volatile("\n \
     global_store_dword %0, %1, %2, offset:0 \n \
     ;;s_waitcnt 0 \n \
     "
                 :
                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
#endif
}

// __buffer_load and __buffer_store
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);

template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                               T* p_dst_block,
                               uint32_t dst_thread_offset,
                               uint32_t dst_const_offset);

template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
                                         uint32_t src_thread_offset,
                                         uint32_t src_const_offset)
174
175
176
177
178
179
180
181
182
183
184
185
{
    float dst;

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
186
187
    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
    ;;s_waitcnt 0 \n \
188
189
190
191
192
193
194
195
    "
                 : "=v"(dst)
                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));

    return dst;
}

template <>
196
197
198
__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float* p_src_block,
                                                                     uint32_t src_thread_offset,
                                                                     uint32_t src_const_offset)
199
200
201
202
203
204
205
206
207
208
209
210
{
    vector_type<float, 2>::MemoryType dst;

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
211
212
    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
    ;;s_waitcnt 0 \n \
213
214
215
216
217
218
219
220
    "
                 : "=v"(dst)
                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));

    return dst;
}

template <>
221
222
223
__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float* p_src_block,
                                                                     uint32_t src_thread_offset,
                                                                     uint32_t src_const_offset)
224
225
226
227
228
229
230
231
232
233
234
235
{
    vector_type<float, 4>::MemoryType dst;

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
236
237
    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
    ;;s_waitcnt 0 \n \
238
239
240
241
242
243
244
245
    "
                 : "=v"(dst)
                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));

    return dst;
}

template <>
246
247
248
249
__device__ void __buffer_store<float, 1>(const float& src,
                                         float* p_dst_block,
                                         uint32_t dst_thread_offset,
                                         uint32_t dst_const_offset)
250
251
252
253
254
255
256
257
258
259
{
    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
260
261
    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
    ;;s_waitcnt 0 \n \
262
263
264
265
266
    "
                 :
                 : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));
}

Chao Liu's avatar
Chao Liu committed
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
__device__ void vmcnt(index_t cnt)
{
    if(cnt == 0)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(0) \n \
                " ::);
    }
    else if(cnt == 1)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(1) \n \
                " ::);
    }
    else if(cnt == 2)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(2) \n \
                " ::);
    }
    else if(cnt == 4)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(2) \n \
                " ::);
    }
    else
    {
        assert(false);
    }
}

__device__ void lgkmcnt(index_t cnt)
{
    if(cnt == 0)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(0) \n \
                " ::);
    }
    else if(cnt == 1)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(1) \n \
                " ::);
    }
    else if(cnt == 2)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(2) \n \
                " ::);
    }
    else if(cnt == 3)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(3) \n \
                " ::);
    }
    else if(cnt == 4)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(4) \n \
                " ::);
    }
    else
    {
        assert(false);
    }
}

Chao Liu's avatar
Chao Liu committed
337
__device__ void outerProduct1x4(const float* a, const float* b, float* c)
Chao Liu's avatar
Chao Liu committed
338
{
Jing Zhang's avatar
Jing Zhang committed
339
340
341
342
343
344
    asm volatile("\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
Chao Liu's avatar
Chao Liu committed
345
346
347
348
349
350
351
352
353
354
                 : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
                 : "v"(a[0]),
                   "v"(b[0]),
                   "v"(b[1]),
                   "v"(b[2]),
                   "v"(b[3]),
                   "0"(c[0]),
                   "1"(c[1]),
                   "2"(c[2]),
                   "3"(c[3]));
Jing Zhang's avatar
Jing Zhang committed
355
356
}

Chao Liu's avatar
Chao Liu committed
357
358
359
__device__ void outerProduct1x4(const float& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c)
Chao Liu's avatar
Chao Liu committed
360
{
Chao Liu's avatar
Chao Liu committed
361
    outerProduct1x4(&a, reinterpret_cast<const float*>(&b), reinterpret_cast<float*>(&c));
Jing Zhang's avatar
Jing Zhang committed
362
363
}

Chao Liu's avatar
Chao Liu committed
364
365
366
367
368
369
370
371
372
__device__ void outerProduct2x4(const vector_type<float, 2>::MemoryType& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c0,
                                vector_type<float, 4>::MemoryType& c1)
{
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
}

Chao Liu's avatar
Chao Liu committed
373
374
375
376
377
378
__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c0,
                                vector_type<float, 4>::MemoryType& c1,
                                vector_type<float, 4>::MemoryType& c2,
                                vector_type<float, 4>::MemoryType& c3)
Chao Liu's avatar
Chao Liu committed
379
{
Jing Zhang's avatar
Jing Zhang committed
380
381
382
383
384
385
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
    outerProduct1x4(a.z, b, c2);
    outerProduct1x4(a.w, b, c3);
}

Chao Liu's avatar
Chao Liu committed
386
387
388
__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
                                const vector_type<float, 4>::MemoryType* b,
                                vector_type<float, 4>::MemoryType* c)
Jing Zhang's avatar
Jing Zhang committed
389
390
391
392
393
394
395
{
    outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
    outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
    outerProduct4x4(a[1], b[0], c[8], c[10], c[12], c[14]);
    outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
}

Chao Liu's avatar
Chao Liu committed
396
__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
397
398
399
400
{
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
401
                ds_read_b128 %0, %1 offset:0\n \
Jing Zhang's avatar
Jing Zhang committed
402
                "
Chao Liu's avatar
Chao Liu committed
403
404
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
405
    }
Jing Zhang's avatar
Jing Zhang committed
406
    if(offset == 64)
Jing Zhang's avatar
Jing Zhang committed
407
408
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
409
                ds_read_b128 %0, %1 offset:64\n \
Jing Zhang's avatar
Jing Zhang committed
410
                "
Chao Liu's avatar
Chao Liu committed
411
412
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
413
    }
Jing Zhang's avatar
Jing Zhang committed
414
    if(offset == 128)
Jing Zhang's avatar
Jing Zhang committed
415
416
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
417
                ds_read_b128 %0, %1 offset:128\n \
Jing Zhang's avatar
Jing Zhang committed
418
                "
Chao Liu's avatar
Chao Liu committed
419
420
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
421
    }
Jing Zhang's avatar
Jing Zhang committed
422
    if(offset == 192)
Jing Zhang's avatar
Jing Zhang committed
423
424
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
425
                ds_read_b128 %0, %1 offset:192\n \
Jing Zhang's avatar
Jing Zhang committed
426
                "
Chao Liu's avatar
Chao Liu committed
427
428
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
429
    }
Jing Zhang's avatar
Jing Zhang committed
430
    if(offset == 256)
Jing Zhang's avatar
Jing Zhang committed
431
432
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
433
                ds_read_b128 %0, %1 offset:256\n \
Jing Zhang's avatar
Jing Zhang committed
434
                "
Chao Liu's avatar
Chao Liu committed
435
436
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
437
    }
Jing Zhang's avatar
Jing Zhang committed
438
    if(offset == 320)
Jing Zhang's avatar
Jing Zhang committed
439
440
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
441
                ds_read_b128 %0, %1 offset:320\n \
Jing Zhang's avatar
Jing Zhang committed
442
                "
Chao Liu's avatar
Chao Liu committed
443
444
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
445
    }
Jing Zhang's avatar
Jing Zhang committed
446
    if(offset == 384)
Jing Zhang's avatar
Jing Zhang committed
447
448
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
449
                ds_read_b128 %0, %1 offset:384\n \
Jing Zhang's avatar
Jing Zhang committed
450
                "
Chao Liu's avatar
Chao Liu committed
451
452
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
453
    }
Jing Zhang's avatar
Jing Zhang committed
454
    if(offset == 448)
Jing Zhang's avatar
Jing Zhang committed
455
456
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
457
                ds_read_b128 %0, %1 offset:448\n \
Jing Zhang's avatar
Jing Zhang committed
458
                "
Chao Liu's avatar
Chao Liu committed
459
460
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
461
    }
Jing Zhang's avatar
Jing Zhang committed
462
    if(offset == 512)
Jing Zhang's avatar
Jing Zhang committed
463
464
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
465
                ds_read_b128 %0, %1 offset:512\n \
Jing Zhang's avatar
Jing Zhang committed
466
                "
Chao Liu's avatar
Chao Liu committed
467
468
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
469
    }
Jing Zhang's avatar
Jing Zhang committed
470
    if(offset == 576)
Jing Zhang's avatar
Jing Zhang committed
471
472
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
473
                ds_read_b128 %0, %1 offset:576\n \
Jing Zhang's avatar
Jing Zhang committed
474
                "
Chao Liu's avatar
Chao Liu committed
475
476
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
477
    }
Jing Zhang's avatar
Jing Zhang committed
478
    if(offset == 640)
Jing Zhang's avatar
Jing Zhang committed
479
480
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
481
                ds_read_b128 %0, %1 offset:640\n \
Jing Zhang's avatar
Jing Zhang committed
482
                "
Chao Liu's avatar
Chao Liu committed
483
484
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
485
    }
Jing Zhang's avatar
Jing Zhang committed
486
    if(offset == 704)
Jing Zhang's avatar
Jing Zhang committed
487
488
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
489
                ds_read_b128 %0, %1 offset:704\n \
Jing Zhang's avatar
Jing Zhang committed
490
                "
Chao Liu's avatar
Chao Liu committed
491
492
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
493
    }
Jing Zhang's avatar
Jing Zhang committed
494
    if(offset == 768)
Jing Zhang's avatar
Jing Zhang committed
495
496
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
497
                ds_read_b128 %0, %1 offset:768\n \
Jing Zhang's avatar
Jing Zhang committed
498
                "
Chao Liu's avatar
Chao Liu committed
499
500
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
501
    }
Jing Zhang's avatar
Jing Zhang committed
502
    if(offset == 832)
Jing Zhang's avatar
Jing Zhang committed
503
504
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
505
                ds_read_b128 %0, %1 offset:832\n \
Jing Zhang's avatar
Jing Zhang committed
506
                "
Chao Liu's avatar
Chao Liu committed
507
508
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
509
    }
Jing Zhang's avatar
Jing Zhang committed
510
    if(offset == 896)
Jing Zhang's avatar
Jing Zhang committed
511
512
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
513
                ds_read_b128 %0, %1 offset:896\n \
Jing Zhang's avatar
Jing Zhang committed
514
                "
Chao Liu's avatar
Chao Liu committed
515
516
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
517
    }
Jing Zhang's avatar
Jing Zhang committed
518
    if(offset == 960)
Jing Zhang's avatar
Jing Zhang committed
519
520
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
521
                ds_read_b128 %0, %1 offset:960\n \
Jing Zhang's avatar
Jing Zhang committed
522
                "
Chao Liu's avatar
Chao Liu committed
523
524
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
525
    }
Jing Zhang's avatar
Jing Zhang committed
526
    if(offset == 1024)
Jing Zhang's avatar
Jing Zhang committed
527
528
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
529
                ds_read_b128 %0, %1 offset:1024\n \
Jing Zhang's avatar
Jing Zhang committed
530
                "
Chao Liu's avatar
Chao Liu committed
531
532
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
533
    }
Jing Zhang's avatar
Jing Zhang committed
534
    if(offset == 1088)
Jing Zhang's avatar
Jing Zhang committed
535
536
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
537
                ds_read_b128 %0, %1 offset:1088\n \
Jing Zhang's avatar
Jing Zhang committed
538
                "
Chao Liu's avatar
Chao Liu committed
539
540
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
541
    }
Jing Zhang's avatar
Jing Zhang committed
542
    if(offset == 1152)
Jing Zhang's avatar
Jing Zhang committed
543
544
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
545
                ds_read_b128 %0, %1 offset:1152\n \
Jing Zhang's avatar
Jing Zhang committed
546
                "
Chao Liu's avatar
Chao Liu committed
547
548
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
549
    }
Jing Zhang's avatar
Jing Zhang committed
550
    if(offset == 1216)
Chao Liu's avatar
Chao Liu committed
551
552
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
553
                ds_read_b128 %0, %1 offset:1216\n \
Chao Liu's avatar
Chao Liu committed
554
                "
Chao Liu's avatar
Chao Liu committed
555
556
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
557
    }
Jing Zhang's avatar
Jing Zhang committed
558
    if(offset == 1280)
Jing Zhang's avatar
Jing Zhang committed
559
560
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
561
                ds_read_b128 %0, %1 offset:1280\n \
Jing Zhang's avatar
Jing Zhang committed
562
                "
Chao Liu's avatar
Chao Liu committed
563
564
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
565
    }
Jing Zhang's avatar
Jing Zhang committed
566
    if(offset == 1344)
Chao Liu's avatar
Chao Liu committed
567
568
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
569
                ds_read_b128 %0, %1 offset:1344\n \
Chao Liu's avatar
Chao Liu committed
570
                "
Chao Liu's avatar
Chao Liu committed
571
572
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
573
    }
Jing Zhang's avatar
Jing Zhang committed
574
    if(offset == 1408)
Jing Zhang's avatar
Jing Zhang committed
575
576
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
577
                ds_read_b128 %0, %1 offset:1408\n \
Jing Zhang's avatar
Jing Zhang committed
578
                "
Chao Liu's avatar
Chao Liu committed
579
580
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
581
    }
Jing Zhang's avatar
Jing Zhang committed
582
    if(offset == 1472)
Chao Liu's avatar
Chao Liu committed
583
584
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
585
                ds_read_b128 %0, %1 offset:1472\n \
Chao Liu's avatar
Chao Liu committed
586
                "
Chao Liu's avatar
Chao Liu committed
587
588
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
589
    }
Jing Zhang's avatar
Jing Zhang committed
590
    if(offset == 1536)
Jing Zhang's avatar
Jing Zhang committed
591
592
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
593
                ds_read_b128 %0, %1 offset:1536\n \
Jing Zhang's avatar
Jing Zhang committed
594
                "
Chao Liu's avatar
Chao Liu committed
595
596
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
597
    }
Jing Zhang's avatar
Jing Zhang committed
598
    if(offset == 1600)
Chao Liu's avatar
Chao Liu committed
599
600
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
601
                ds_read_b128 %0, %1 offset:1600\n \
Chao Liu's avatar
Chao Liu committed
602
                "
Chao Liu's avatar
Chao Liu committed
603
604
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
605
    }
Jing Zhang's avatar
Jing Zhang committed
606
    if(offset == 1664)
Jing Zhang's avatar
Jing Zhang committed
607
608
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
609
                ds_read_b128 %0, %1 offset:1664\n \
Jing Zhang's avatar
Jing Zhang committed
610
                "
Chao Liu's avatar
Chao Liu committed
611
612
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
613
    }
Jing Zhang's avatar
Jing Zhang committed
614
    if(offset == 1728)
Chao Liu's avatar
Chao Liu committed
615
616
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
617
                ds_read_b128 %0, %1 offset:1728\n \
Chao Liu's avatar
Chao Liu committed
618
                "
Chao Liu's avatar
Chao Liu committed
619
620
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
621
    }
Jing Zhang's avatar
Jing Zhang committed
622
    if(offset == 1792)
Jing Zhang's avatar
Jing Zhang committed
623
624
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
625
                ds_read_b128 %0, %1 offset:1792\n \
Jing Zhang's avatar
Jing Zhang committed
626
                "
Chao Liu's avatar
Chao Liu committed
627
628
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
629
    }
Jing Zhang's avatar
Jing Zhang committed
630
    if(offset == 1856)
Chao Liu's avatar
Chao Liu committed
631
632
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
633
                ds_read_b128 %0, %1 offset:1856\n \
Chao Liu's avatar
Chao Liu committed
634
                "
Chao Liu's avatar
Chao Liu committed
635
636
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
637
    }
Jing Zhang's avatar
Jing Zhang committed
638
    if(offset == 1920)
Jing Zhang's avatar
Jing Zhang committed
639
640
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
641
                ds_read_b128 %0, %1 offset:1920\n \
Jing Zhang's avatar
Jing Zhang committed
642
                "
Chao Liu's avatar
Chao Liu committed
643
644
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
645
    }
Jing Zhang's avatar
Jing Zhang committed
646
    if(offset == 1984)
Chao Liu's avatar
Chao Liu committed
647
648
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
649
                ds_read_b128 %0, %1 offset:1984\n \
Chao Liu's avatar
Chao Liu committed
650
                "
Chao Liu's avatar
Chao Liu committed
651
652
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
653
    }
Jing Zhang's avatar
Jing Zhang committed
654
    if(offset == 2048)
Jing Zhang's avatar
Jing Zhang committed
655
656
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
657
                ds_read_b128 %0, %1 offset:2048\n \
Jing Zhang's avatar
Jing Zhang committed
658
                "
Chao Liu's avatar
Chao Liu committed
659
660
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
661
    }
Jing Zhang's avatar
Jing Zhang committed
662
    if(offset == 2112)
Jing Zhang's avatar
Jing Zhang committed
663
664
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
665
                ds_read_b128 %0, %1 offset:2112\n \
Jing Zhang's avatar
Jing Zhang committed
666
                "
Chao Liu's avatar
Chao Liu committed
667
668
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
669
    }
Jing Zhang's avatar
Jing Zhang committed
670
    if(offset == 2176)
Jing Zhang's avatar
Jing Zhang committed
671
    {
Jing Zhang's avatar
Jing Zhang committed
672
673
674
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2176\n \
                "
Chao Liu's avatar
Chao Liu committed
675
676
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
677
678
679
680
681
682
    }
    if(offset == 2240)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2240\n \
                "
Chao Liu's avatar
Chao Liu committed
683
684
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
685
686
687
688
689
690
    }
    if(offset == 2304)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2304\n \
                "
Chao Liu's avatar
Chao Liu committed
691
692
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
693
694
695
696
697
698
    }
    if(offset == 2368)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2368\n \
                "
Chao Liu's avatar
Chao Liu committed
699
700
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
701
702
703
704
705
706
    }
    if(offset == 2432)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2432\n \
                "
Chao Liu's avatar
Chao Liu committed
707
708
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
709
710
711
712
713
714
    }
    if(offset == 2496)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2496\n \
                "
Chao Liu's avatar
Chao Liu committed
715
716
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
717
718
719
720
721
722
    }
    if(offset == 2560)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2560\n \
                "
Chao Liu's avatar
Chao Liu committed
723
724
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
725
726
727
728
729
730
    }
    if(offset == 2624)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2624\n \
                "
Chao Liu's avatar
Chao Liu committed
731
732
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
733
734
735
736
737
738
    }
    if(offset == 2688)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2688\n \
                "
Chao Liu's avatar
Chao Liu committed
739
740
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
741
742
743
744
745
746
    }
    if(offset == 2752)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2752\n \
                "
Chao Liu's avatar
Chao Liu committed
747
748
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
749
750
751
752
753
754
    }
    if(offset == 2816)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2816\n \
                "
Chao Liu's avatar
Chao Liu committed
755
756
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
757
758
759
760
761
762
    }
    if(offset == 2880)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2880\n \
                "
Chao Liu's avatar
Chao Liu committed
763
764
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
765
766
767
768
769
770
    }
    if(offset == 2944)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2944\n \
                "
Chao Liu's avatar
Chao Liu committed
771
772
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
773
774
775
776
777
778
    }
    if(offset == 3008)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3008\n \
                "
Chao Liu's avatar
Chao Liu committed
779
780
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
781
782
783
784
785
786
    }
    if(offset == 3072)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3072\n \
                "
Chao Liu's avatar
Chao Liu committed
787
788
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
789
790
791
792
793
794
    }
    if(offset == 3136)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3136\n \
                "
Chao Liu's avatar
Chao Liu committed
795
796
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
797
798
799
800
801
802
    }
    if(offset == 3200)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3200\n \
                "
Chao Liu's avatar
Chao Liu committed
803
804
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
805
806
807
808
809
810
    }
    if(offset == 3264)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3264\n \
                "
Chao Liu's avatar
Chao Liu committed
811
812
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
813
814
815
816
817
818
    }
    if(offset == 3328)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3328\n \
                "
Chao Liu's avatar
Chao Liu committed
819
820
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
821
822
823
824
825
826
    }
    if(offset == 3392)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3392\n \
                "
Chao Liu's avatar
Chao Liu committed
827
828
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
829
830
831
832
833
834
    }
    if(offset == 3456)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3456\n \
                "
Chao Liu's avatar
Chao Liu committed
835
836
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
837
838
839
840
841
842
    }
    if(offset == 3520)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3520\n \
                "
Chao Liu's avatar
Chao Liu committed
843
844
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
845
846
847
848
849
850
    }
    if(offset == 3584)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3584\n \
                "
Chao Liu's avatar
Chao Liu committed
851
852
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
853
854
855
856
857
858
    }
    if(offset == 3648)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3648\n \
                "
Chao Liu's avatar
Chao Liu committed
859
860
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
861
862
863
864
865
866
    }
    if(offset == 3712)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3712\n \
                "
Chao Liu's avatar
Chao Liu committed
867
868
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
869
870
871
872
873
874
    }
    if(offset == 3776)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3776\n \
                "
Chao Liu's avatar
Chao Liu committed
875
876
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
877
878
879
880
881
882
    }
    if(offset == 3840)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3840\n \
                "
Chao Liu's avatar
Chao Liu committed
883
884
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
885
886
887
888
889
890
    }
    if(offset == 3904)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3904\n \
                "
Chao Liu's avatar
Chao Liu committed
891
892
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
893
894
895
896
897
898
    }
    if(offset == 3968)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3968\n \
                "
Chao Liu's avatar
Chao Liu committed
899
900
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
901
902
903
904
905
906
    }
    if(offset == 4032)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4032\n \
                "
Chao Liu's avatar
Chao Liu committed
907
908
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
909
910
911
912
913
914
    }
    if(offset == 4096)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4096\n \
                "
Chao Liu's avatar
Chao Liu committed
915
916
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
917
    }
Jing Zhang's avatar
Jing Zhang committed
918
919
}

Chao Liu's avatar
Chao Liu committed
920
921
__device__ void
ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
922
{
923
924
925
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
926
927
            ds_write_b128 %0, %1 \n \
            "
928
929
930
931
932
933
934
                     :
                     : "v"(__to_local(lds)), "v"(r));
    }
    else
    {
        assert(false);
    }
Jing Zhang's avatar
Jing Zhang committed
935
}
936
937
938

} // namespace ck
#endif