amd_inline_asm.hpp 35.6 KB
Newer Older
1
2
3
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP

Chao Liu's avatar
Chao Liu committed
4
#include "vector_type.hpp"
Jing Zhang's avatar
Jing Zhang committed
5

6
7
namespace ck {

Chao Liu's avatar
Chao Liu committed
8
// cast a pointer of LDS to its address
Chao Liu's avatar
Chao Liu committed
9
extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
Chao Liu's avatar
Chao Liu committed
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
                                           uint32_t vindex,
                                           uint32_t offset,
                                           bool glc,
                                           bool slc) __asm("llvm.amdgcn.buffer.load");

__device__ vector_type<float, 2>::MemoryType
__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
                            uint32_t vindex,
                            uint32_t offset,
                            bool glc,
                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");

__device__ vector_type<float, 4>::MemoryType
__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
                            uint32_t vindex,
                            uint32_t offset,
                            bool glc,
                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");

__device__ void __llvm_amdgcn_buffer_store(float vdata,
                                           int32x4_t rsrc,
                                           uint32_t vindex,
                                           uint32_t offset,
                                           bool glc,
                                           bool slc) __asm("llvm.amdgcn.buffer.store");

__device__ void __llvm_amdgcn_buffer_storex2(vector_type<float, 2>::MemoryType vdata,
                                             int32x4_t rsrc,
                                             uint32_t vindex,
                                             uint32_t offset,
                                             bool glc,
                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");

__device__ void __llvm_amdgcn_buffer_storex4(vector_type<float, 4>::MemoryType vdata,
                                             int32x4_t rsrc,
                                             uint32_t vindex,
                                             uint32_t offset,
                                             bool glc,
                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");

52
// global_load and global_store
53
template <typename T, index_t VectorSize>
54
55
__device__ typename vector_type<T, VectorSize>::MemoryType __global_load(
    const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
56
57

template <typename T, index_t VectorSize>
58
59
__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                               T* p_dst_block,
60
61
                               uint32_t dst_thread_data_offset,
                               uint32_t dst_const_data_offset);
62
63

template <>
64
__device__ float __global_load<float, 1>(const float* p_src_block,
65
66
                                         uint32_t src_thread_data_offset,
                                         uint32_t src_const_data_offset)
67
68
69
{
    float dst;

70
71
72
73
74
#if 0   // source code
    dst = p_src_block[src_const_data_offset + src_thread_data_offset];
#elif 0 // use VGPR only
    const float* src_thread_addr_offset_u64 =
        p_src_block + src_const_data_offset + src_thread_data_offset;
75
76

    asm volatile("\n \
77
     global_load_dword %0, %1 off offset:0 \n \
78
79
80
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
81
82
83
84
                 : "v"(src_thread_addr_offset_u64));
#elif 0 // use VGPR and SGPR, do compute on VALU
    uint64_t src_thread_addr_offset_u64 =
        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
85

86
87
88
89
90
91
92
93
94
    asm volatile("\n \
     global_load_dword %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
#elif 1 // use VGPR and SGPR, do compute on SALU
    uint64_t src_thread_addr_offset_u64 =
        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
95

96
    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
97
98
99

    asm volatile("\n \
     global_load_dword %0, %1, %2, offset:0 \n \
100
     s_waitcnt 0 \n \
101
102
     "
                 : "=v"(dst)
103
104
                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
#endif
105
106
107
108
109

    return dst;
}

template <>
110
111
__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(
    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
112
{
113
114
115
    using vector_t = vector_type<float, 2>::MemoryType;

    vector_t dst;
116

117
118
119
120
121
#if 0   // source code
    dst = *reinterpret_cast<const vector_t*>(&p_src_block[src_const_data_offset + src_thread_data_offset]);
#elif 0 // use VGPR only
    const float* src_thread_addr_offset_u64 =
        p_src_block + src_const_data_offset + src_thread_data_offset;
122
123

    asm volatile("\n \
124
     global_load_dwordx2 %0, %1 off offset:0 \n \
125
126
127
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
128
129
130
131
                 : "v"(src_thread_addr_offset_u64));
#elif 0 // use VGPR and SGPR, do compute on VALU
    uint64_t src_thread_addr_offset_u64 =
        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
132

133
134
135
136
137
138
139
140
141
    asm volatile("\n \
     global_load_dwordx2 %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
#elif 1 // use VGPR and SGPR, do compute on SALU
    uint64_t src_thread_addr_offset_u64 =
        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
142

143
    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
144
145
146

    asm volatile("\n \
     global_load_dwordx2 %0, %1, %2, offset:0 \n \
147
     s_waitcnt 0 \n \
148
149
     "
                 : "=v"(dst)
150
151
                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
#endif
152
153
154
155
156

    return dst;
}

template <>
157
158
__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(
    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
159
{
160
161
162
    using vector_t = vector_type<float, 4>::MemoryType;

    vector_t dst;
163

164
165
166
167
168
#if 0   // source code
    dst = *reinterpret_cast<const vector_t*>(&p_src_block[src_const_data_offset + src_thread_data_offset]);
#elif 0 // use VGPR only
    const float* src_thread_addr_offset_u64 =
        p_src_block + src_const_data_offset + src_thread_data_offset;
169
170

    asm volatile("\n \
171
     global_load_dwordx4 %0, %1 off offset:0 \n \
172
173
174
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
175
176
177
178
                 : "v"(src_thread_addr_offset_u64));
#elif 0 // use VGPR and SGPR, do compute on VALU
    uint64_t src_thread_addr_offset_u64 =
        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
179

180
181
182
183
184
185
186
187
188
    asm volatile("\n \
     global_load_dwordx4 %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
#elif 1 // use VGPR and SGPR, do compute on SALU
    uint64_t src_thread_addr_offset_u64 =
        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
189

190
    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
191
192
193

    asm volatile("\n \
     global_load_dwordx4 %0, %1, %2, offset:0 \n \
194
     s_waitcnt 0 \n \
195
196
     "
                 : "=v"(dst)
197
198
                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
#endif
199
200
201
202
203
204
205

    return dst;
}

template <>
__device__ void __global_store<float, 1>(const float& src,
                                         float* p_dst_block,
206
207
                                         uint32_t dst_thread_data_offset,
                                         uint32_t dst_const_data_offset)
208
209
{
#if 0 // compute on VALU
210
    uint64_t dst_thread_data_offset_u64 = (dst_thread_data_offset + dst_const_data_offset) * sizeof(float);
211
212
213
214
215

    asm volatile("\n \
     global_store_dword %0, %1, %2, offset:0 \n \
     "
                 :
216
                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block));
217
#else // compute on SALU
218
    uint64_t dst_thread_data_offset_u64 = dst_thread_data_offset * sizeof(float);
219

220
    float* p_dst_block_with_offset = p_dst_block + dst_const_data_offset;
221
222
223
224
225

    asm volatile("\n \
     global_store_dword %0, %1, %2, offset:0 \n \
     "
                 :
226
                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
227
228
229
#endif
}

230
// buffer_load and buffer_store
231
template <typename T, index_t VectorSize>
232
233
__device__ typename vector_type<T, VectorSize>::MemoryType __buffer_load(
    const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
234
235
236
237

template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                               T* p_dst_block,
238
239
                               uint32_t dst_thread_data_offset,
                               uint32_t dst_const_data_offset);
240
241
242

template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
243
244
                                         uint32_t src_thread_data_offset,
                                         uint32_t src_const_data_offset)
245
{
246
#if 0
247
248
    float dst;

249
250
251
    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

252
253
254
255
256
257
258
259
260
    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
261
    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
262
    s_waitcnt 0 \n \
263
264
    "
                 : "=v"(dst)
265
                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
266
267

    return dst;
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#else
    float dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    dst = __llvm_amdgcn_buffer_load(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);

    return dst;
#endif
287
288
289
}

template <>
290
291
__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
292
{
293
#if 0
294
295
    vector_type<float, 2>::MemoryType dst;

296
297
298
    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

299
300
301
302
303
304
305
306
307
    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
308
    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
309
    s_waitcnt 0 \n \
310
311
    "
                 : "=v"(dst)
312
                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
313
314

    return dst;
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#else
    vector_type<float, 2>::MemoryType dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    dst = __llvm_amdgcn_buffer_loadx2(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);

    return dst;
#endif
334
335
336
}

template <>
337
338
__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
339
{
340
#if 0
341
342
    vector_type<float, 4>::MemoryType dst;

343
344
345
    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

346
347
348
349
350
351
352
353
354
    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
355
    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
356
    s_waitcnt 0 \n \
357
358
    "
                 : "=v"(dst)
359
                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
360
361

    return dst;
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
#elif 1
    vector_type<float, 4>::MemoryType dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    dst = __llvm_amdgcn_buffer_loadx4(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);

    return dst;
#endif
381
382
383
}

template <>
384
385
__device__ void __buffer_store<float, 1>(const float& src,
                                         float* p_dst_block,
386
387
                                         uint32_t dst_thread_data_offset,
                                         uint32_t dst_const_data_offset)
388
{
389
#if 0
390
391
392
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

393
394
395
396
397
398
399
400
401
    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
402
    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
403
404
    "
                 :
405
406
407
408
                 : "s"(dst_block_setting),
                   "v"(src),
                   "v"(dst_thread_addr_offset),
                   "s"(dst_const_addr_offset));
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
#else
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    __llvm_amdgcn_buffer_store(
        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

template <>
__device__ void __buffer_store<float, 2>(const vector_type<float, 2>::MemoryType& src,
                                         float* p_dst_block,
                                         uint32_t dst_thread_data_offset,
                                         uint32_t dst_const_data_offset)
{
#if 0
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
    "
                 :
                 : "s"(dst_block_setting),
                   "v"(src),
                   "v"(dst_thread_addr_offset),
                   "s"(dst_const_addr_offset));
#else
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    __llvm_amdgcn_buffer_storex2(
        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

template <>
__device__ void __buffer_store<float, 4>(const vector_type<float, 4>::MemoryType& src,
                                         float* p_dst_block,
                                         uint32_t dst_thread_data_offset,
                                         uint32_t dst_const_data_offset)
{
#if 0
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
    "
                 :
                 : "s"(dst_block_setting),
                   "v"(src),
                   "v"(dst_thread_addr_offset),
                   "s"(dst_const_addr_offset));
#else
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    __llvm_amdgcn_buffer_storex4(
        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
510
511
}

Chao Liu's avatar
Chao Liu committed
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
__device__ void vmcnt(index_t cnt)
{
    if(cnt == 0)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(0) \n \
                " ::);
    }
    else if(cnt == 1)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(1) \n \
                " ::);
    }
    else if(cnt == 2)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(2) \n \
                " ::);
    }
    else if(cnt == 4)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(2) \n \
                " ::);
    }
    else
    {
        assert(false);
    }
}

__device__ void lgkmcnt(index_t cnt)
{
    if(cnt == 0)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(0) \n \
                " ::);
    }
    else if(cnt == 1)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(1) \n \
                " ::);
    }
    else if(cnt == 2)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(2) \n \
                " ::);
    }
    else if(cnt == 3)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(3) \n \
                " ::);
    }
    else if(cnt == 4)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(4) \n \
                " ::);
    }
    else
    {
        assert(false);
    }
}

Chao Liu's avatar
Chao Liu committed
582
__device__ void outerProduct1x4(const float* a, const float* b, float* c)
Chao Liu's avatar
Chao Liu committed
583
{
Jing Zhang's avatar
Jing Zhang committed
584
585
586
587
588
589
    asm volatile("\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
Chao Liu's avatar
Chao Liu committed
590
591
592
593
594
595
596
597
598
599
                 : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
                 : "v"(a[0]),
                   "v"(b[0]),
                   "v"(b[1]),
                   "v"(b[2]),
                   "v"(b[3]),
                   "0"(c[0]),
                   "1"(c[1]),
                   "2"(c[2]),
                   "3"(c[3]));
Jing Zhang's avatar
Jing Zhang committed
600
601
}

Chao Liu's avatar
Chao Liu committed
602
603
604
__device__ void outerProduct1x4(const float& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c)
Chao Liu's avatar
Chao Liu committed
605
{
Chao Liu's avatar
Chao Liu committed
606
    outerProduct1x4(&a, reinterpret_cast<const float*>(&b), reinterpret_cast<float*>(&c));
Jing Zhang's avatar
Jing Zhang committed
607
608
}

Chao Liu's avatar
Chao Liu committed
609
610
611
612
613
614
615
616
617
__device__ void outerProduct2x4(const vector_type<float, 2>::MemoryType& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c0,
                                vector_type<float, 4>::MemoryType& c1)
{
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
}

Chao Liu's avatar
Chao Liu committed
618
619
620
621
622
623
__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c0,
                                vector_type<float, 4>::MemoryType& c1,
                                vector_type<float, 4>::MemoryType& c2,
                                vector_type<float, 4>::MemoryType& c3)
Chao Liu's avatar
Chao Liu committed
624
{
Jing Zhang's avatar
Jing Zhang committed
625
626
627
628
629
630
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
    outerProduct1x4(a.z, b, c2);
    outerProduct1x4(a.w, b, c3);
}

Chao Liu's avatar
Chao Liu committed
631
632
633
__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
                                const vector_type<float, 4>::MemoryType* b,
                                vector_type<float, 4>::MemoryType* c)
Jing Zhang's avatar
Jing Zhang committed
634
635
636
637
638
639
640
{
    outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
    outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
    outerProduct4x4(a[1], b[0], c[8], c[10], c[12], c[14]);
    outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
}

Chao Liu's avatar
Chao Liu committed
641
__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
642
643
644
645
{
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
646
                ds_read_b128 %0, %1 offset:0\n \
Jing Zhang's avatar
Jing Zhang committed
647
                "
Chao Liu's avatar
Chao Liu committed
648
649
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
650
    }
Jing Zhang's avatar
Jing Zhang committed
651
    if(offset == 64)
Jing Zhang's avatar
Jing Zhang committed
652
653
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
654
                ds_read_b128 %0, %1 offset:64\n \
Jing Zhang's avatar
Jing Zhang committed
655
                "
Chao Liu's avatar
Chao Liu committed
656
657
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
658
    }
Jing Zhang's avatar
Jing Zhang committed
659
    if(offset == 128)
Jing Zhang's avatar
Jing Zhang committed
660
661
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
662
                ds_read_b128 %0, %1 offset:128\n \
Jing Zhang's avatar
Jing Zhang committed
663
                "
Chao Liu's avatar
Chao Liu committed
664
665
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
666
    }
Jing Zhang's avatar
Jing Zhang committed
667
    if(offset == 192)
Jing Zhang's avatar
Jing Zhang committed
668
669
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
670
                ds_read_b128 %0, %1 offset:192\n \
Jing Zhang's avatar
Jing Zhang committed
671
                "
Chao Liu's avatar
Chao Liu committed
672
673
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
674
    }
Jing Zhang's avatar
Jing Zhang committed
675
    if(offset == 256)
Jing Zhang's avatar
Jing Zhang committed
676
677
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
678
                ds_read_b128 %0, %1 offset:256\n \
Jing Zhang's avatar
Jing Zhang committed
679
                "
Chao Liu's avatar
Chao Liu committed
680
681
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
682
    }
Jing Zhang's avatar
Jing Zhang committed
683
    if(offset == 320)
Jing Zhang's avatar
Jing Zhang committed
684
685
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
686
                ds_read_b128 %0, %1 offset:320\n \
Jing Zhang's avatar
Jing Zhang committed
687
                "
Chao Liu's avatar
Chao Liu committed
688
689
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
690
    }
Jing Zhang's avatar
Jing Zhang committed
691
    if(offset == 384)
Jing Zhang's avatar
Jing Zhang committed
692
693
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
694
                ds_read_b128 %0, %1 offset:384\n \
Jing Zhang's avatar
Jing Zhang committed
695
                "
Chao Liu's avatar
Chao Liu committed
696
697
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
698
    }
Jing Zhang's avatar
Jing Zhang committed
699
    if(offset == 448)
Jing Zhang's avatar
Jing Zhang committed
700
701
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
702
                ds_read_b128 %0, %1 offset:448\n \
Jing Zhang's avatar
Jing Zhang committed
703
                "
Chao Liu's avatar
Chao Liu committed
704
705
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
706
    }
Jing Zhang's avatar
Jing Zhang committed
707
    if(offset == 512)
Jing Zhang's avatar
Jing Zhang committed
708
709
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
710
                ds_read_b128 %0, %1 offset:512\n \
Jing Zhang's avatar
Jing Zhang committed
711
                "
Chao Liu's avatar
Chao Liu committed
712
713
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
714
    }
Jing Zhang's avatar
Jing Zhang committed
715
    if(offset == 576)
Jing Zhang's avatar
Jing Zhang committed
716
717
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
718
                ds_read_b128 %0, %1 offset:576\n \
Jing Zhang's avatar
Jing Zhang committed
719
                "
Chao Liu's avatar
Chao Liu committed
720
721
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
722
    }
Jing Zhang's avatar
Jing Zhang committed
723
    if(offset == 640)
Jing Zhang's avatar
Jing Zhang committed
724
725
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
726
                ds_read_b128 %0, %1 offset:640\n \
Jing Zhang's avatar
Jing Zhang committed
727
                "
Chao Liu's avatar
Chao Liu committed
728
729
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
730
    }
Jing Zhang's avatar
Jing Zhang committed
731
    if(offset == 704)
Jing Zhang's avatar
Jing Zhang committed
732
733
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
734
                ds_read_b128 %0, %1 offset:704\n \
Jing Zhang's avatar
Jing Zhang committed
735
                "
Chao Liu's avatar
Chao Liu committed
736
737
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
738
    }
Jing Zhang's avatar
Jing Zhang committed
739
    if(offset == 768)
Jing Zhang's avatar
Jing Zhang committed
740
741
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
742
                ds_read_b128 %0, %1 offset:768\n \
Jing Zhang's avatar
Jing Zhang committed
743
                "
Chao Liu's avatar
Chao Liu committed
744
745
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
746
    }
Jing Zhang's avatar
Jing Zhang committed
747
    if(offset == 832)
Jing Zhang's avatar
Jing Zhang committed
748
749
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
750
                ds_read_b128 %0, %1 offset:832\n \
Jing Zhang's avatar
Jing Zhang committed
751
                "
Chao Liu's avatar
Chao Liu committed
752
753
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
754
    }
Jing Zhang's avatar
Jing Zhang committed
755
    if(offset == 896)
Jing Zhang's avatar
Jing Zhang committed
756
757
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
758
                ds_read_b128 %0, %1 offset:896\n \
Jing Zhang's avatar
Jing Zhang committed
759
                "
Chao Liu's avatar
Chao Liu committed
760
761
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
762
    }
Jing Zhang's avatar
Jing Zhang committed
763
    if(offset == 960)
Jing Zhang's avatar
Jing Zhang committed
764
765
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
766
                ds_read_b128 %0, %1 offset:960\n \
Jing Zhang's avatar
Jing Zhang committed
767
                "
Chao Liu's avatar
Chao Liu committed
768
769
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
770
    }
Jing Zhang's avatar
Jing Zhang committed
771
    if(offset == 1024)
Jing Zhang's avatar
Jing Zhang committed
772
773
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
774
                ds_read_b128 %0, %1 offset:1024\n \
Jing Zhang's avatar
Jing Zhang committed
775
                "
Chao Liu's avatar
Chao Liu committed
776
777
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
778
    }
Jing Zhang's avatar
Jing Zhang committed
779
    if(offset == 1088)
Jing Zhang's avatar
Jing Zhang committed
780
781
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
782
                ds_read_b128 %0, %1 offset:1088\n \
Jing Zhang's avatar
Jing Zhang committed
783
                "
Chao Liu's avatar
Chao Liu committed
784
785
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
786
    }
Jing Zhang's avatar
Jing Zhang committed
787
    if(offset == 1152)
Jing Zhang's avatar
Jing Zhang committed
788
789
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
790
                ds_read_b128 %0, %1 offset:1152\n \
Jing Zhang's avatar
Jing Zhang committed
791
                "
Chao Liu's avatar
Chao Liu committed
792
793
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
794
    }
Jing Zhang's avatar
Jing Zhang committed
795
    if(offset == 1216)
Chao Liu's avatar
Chao Liu committed
796
797
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
798
                ds_read_b128 %0, %1 offset:1216\n \
Chao Liu's avatar
Chao Liu committed
799
                "
Chao Liu's avatar
Chao Liu committed
800
801
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
802
    }
Jing Zhang's avatar
Jing Zhang committed
803
    if(offset == 1280)
Jing Zhang's avatar
Jing Zhang committed
804
805
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
806
                ds_read_b128 %0, %1 offset:1280\n \
Jing Zhang's avatar
Jing Zhang committed
807
                "
Chao Liu's avatar
Chao Liu committed
808
809
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
810
    }
Jing Zhang's avatar
Jing Zhang committed
811
    if(offset == 1344)
Chao Liu's avatar
Chao Liu committed
812
813
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
814
                ds_read_b128 %0, %1 offset:1344\n \
Chao Liu's avatar
Chao Liu committed
815
                "
Chao Liu's avatar
Chao Liu committed
816
817
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
818
    }
Jing Zhang's avatar
Jing Zhang committed
819
    if(offset == 1408)
Jing Zhang's avatar
Jing Zhang committed
820
821
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
822
                ds_read_b128 %0, %1 offset:1408\n \
Jing Zhang's avatar
Jing Zhang committed
823
                "
Chao Liu's avatar
Chao Liu committed
824
825
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
826
    }
Jing Zhang's avatar
Jing Zhang committed
827
    if(offset == 1472)
Chao Liu's avatar
Chao Liu committed
828
829
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
830
                ds_read_b128 %0, %1 offset:1472\n \
Chao Liu's avatar
Chao Liu committed
831
                "
Chao Liu's avatar
Chao Liu committed
832
833
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
834
    }
Jing Zhang's avatar
Jing Zhang committed
835
    if(offset == 1536)
Jing Zhang's avatar
Jing Zhang committed
836
837
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
838
                ds_read_b128 %0, %1 offset:1536\n \
Jing Zhang's avatar
Jing Zhang committed
839
                "
Chao Liu's avatar
Chao Liu committed
840
841
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
842
    }
Jing Zhang's avatar
Jing Zhang committed
843
    if(offset == 1600)
Chao Liu's avatar
Chao Liu committed
844
845
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
846
                ds_read_b128 %0, %1 offset:1600\n \
Chao Liu's avatar
Chao Liu committed
847
                "
Chao Liu's avatar
Chao Liu committed
848
849
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
850
    }
Jing Zhang's avatar
Jing Zhang committed
851
    if(offset == 1664)
Jing Zhang's avatar
Jing Zhang committed
852
853
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
854
                ds_read_b128 %0, %1 offset:1664\n \
Jing Zhang's avatar
Jing Zhang committed
855
                "
Chao Liu's avatar
Chao Liu committed
856
857
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
858
    }
Jing Zhang's avatar
Jing Zhang committed
859
    if(offset == 1728)
Chao Liu's avatar
Chao Liu committed
860
861
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
862
                ds_read_b128 %0, %1 offset:1728\n \
Chao Liu's avatar
Chao Liu committed
863
                "
Chao Liu's avatar
Chao Liu committed
864
865
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
866
    }
Jing Zhang's avatar
Jing Zhang committed
867
    if(offset == 1792)
Jing Zhang's avatar
Jing Zhang committed
868
869
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
870
                ds_read_b128 %0, %1 offset:1792\n \
Jing Zhang's avatar
Jing Zhang committed
871
                "
Chao Liu's avatar
Chao Liu committed
872
873
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
874
    }
Jing Zhang's avatar
Jing Zhang committed
875
    if(offset == 1856)
Chao Liu's avatar
Chao Liu committed
876
877
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
878
                ds_read_b128 %0, %1 offset:1856\n \
Chao Liu's avatar
Chao Liu committed
879
                "
Chao Liu's avatar
Chao Liu committed
880
881
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
882
    }
Jing Zhang's avatar
Jing Zhang committed
883
    if(offset == 1920)
Jing Zhang's avatar
Jing Zhang committed
884
885
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
886
                ds_read_b128 %0, %1 offset:1920\n \
Jing Zhang's avatar
Jing Zhang committed
887
                "
Chao Liu's avatar
Chao Liu committed
888
889
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
890
    }
Jing Zhang's avatar
Jing Zhang committed
891
    if(offset == 1984)
Chao Liu's avatar
Chao Liu committed
892
893
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
894
                ds_read_b128 %0, %1 offset:1984\n \
Chao Liu's avatar
Chao Liu committed
895
                "
Chao Liu's avatar
Chao Liu committed
896
897
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
898
    }
Jing Zhang's avatar
Jing Zhang committed
899
    if(offset == 2048)
Jing Zhang's avatar
Jing Zhang committed
900
901
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
902
                ds_read_b128 %0, %1 offset:2048\n \
Jing Zhang's avatar
Jing Zhang committed
903
                "
Chao Liu's avatar
Chao Liu committed
904
905
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
906
    }
Jing Zhang's avatar
Jing Zhang committed
907
    if(offset == 2112)
Jing Zhang's avatar
Jing Zhang committed
908
909
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
910
                ds_read_b128 %0, %1 offset:2112\n \
Jing Zhang's avatar
Jing Zhang committed
911
                "
Chao Liu's avatar
Chao Liu committed
912
913
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
914
    }
Jing Zhang's avatar
Jing Zhang committed
915
    if(offset == 2176)
Jing Zhang's avatar
Jing Zhang committed
916
    {
Jing Zhang's avatar
Jing Zhang committed
917
918
919
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2176\n \
                "
Chao Liu's avatar
Chao Liu committed
920
921
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
922
923
924
925
926
927
    }
    if(offset == 2240)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2240\n \
                "
Chao Liu's avatar
Chao Liu committed
928
929
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
930
931
932
933
934
935
    }
    if(offset == 2304)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2304\n \
                "
Chao Liu's avatar
Chao Liu committed
936
937
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
938
939
940
941
942
943
    }
    if(offset == 2368)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2368\n \
                "
Chao Liu's avatar
Chao Liu committed
944
945
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
946
947
948
949
950
951
    }
    if(offset == 2432)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2432\n \
                "
Chao Liu's avatar
Chao Liu committed
952
953
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
954
955
956
957
958
959
    }
    if(offset == 2496)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2496\n \
                "
Chao Liu's avatar
Chao Liu committed
960
961
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
962
963
964
965
966
967
    }
    if(offset == 2560)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2560\n \
                "
Chao Liu's avatar
Chao Liu committed
968
969
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
970
971
972
973
974
975
    }
    if(offset == 2624)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2624\n \
                "
Chao Liu's avatar
Chao Liu committed
976
977
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
978
979
980
981
982
983
    }
    if(offset == 2688)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2688\n \
                "
Chao Liu's avatar
Chao Liu committed
984
985
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
986
987
988
989
990
991
    }
    if(offset == 2752)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2752\n \
                "
Chao Liu's avatar
Chao Liu committed
992
993
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
994
995
996
997
998
999
    }
    if(offset == 2816)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2816\n \
                "
Chao Liu's avatar
Chao Liu committed
1000
1001
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1002
1003
1004
1005
1006
1007
    }
    if(offset == 2880)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2880\n \
                "
Chao Liu's avatar
Chao Liu committed
1008
1009
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1010
1011
1012
1013
1014
1015
    }
    if(offset == 2944)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2944\n \
                "
Chao Liu's avatar
Chao Liu committed
1016
1017
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1018
1019
1020
1021
1022
1023
    }
    if(offset == 3008)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3008\n \
                "
Chao Liu's avatar
Chao Liu committed
1024
1025
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1026
1027
1028
1029
1030
1031
    }
    if(offset == 3072)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3072\n \
                "
Chao Liu's avatar
Chao Liu committed
1032
1033
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1034
1035
1036
1037
1038
1039
    }
    if(offset == 3136)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3136\n \
                "
Chao Liu's avatar
Chao Liu committed
1040
1041
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1042
1043
1044
1045
1046
1047
    }
    if(offset == 3200)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3200\n \
                "
Chao Liu's avatar
Chao Liu committed
1048
1049
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1050
1051
1052
1053
1054
1055
    }
    if(offset == 3264)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3264\n \
                "
Chao Liu's avatar
Chao Liu committed
1056
1057
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1058
1059
1060
1061
1062
1063
    }
    if(offset == 3328)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3328\n \
                "
Chao Liu's avatar
Chao Liu committed
1064
1065
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1066
1067
1068
1069
1070
1071
    }
    if(offset == 3392)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3392\n \
                "
Chao Liu's avatar
Chao Liu committed
1072
1073
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1074
1075
1076
1077
1078
1079
    }
    if(offset == 3456)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3456\n \
                "
Chao Liu's avatar
Chao Liu committed
1080
1081
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1082
1083
1084
1085
1086
1087
    }
    if(offset == 3520)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3520\n \
                "
Chao Liu's avatar
Chao Liu committed
1088
1089
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1090
1091
1092
1093
1094
1095
    }
    if(offset == 3584)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3584\n \
                "
Chao Liu's avatar
Chao Liu committed
1096
1097
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1098
1099
1100
1101
1102
1103
    }
    if(offset == 3648)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3648\n \
                "
Chao Liu's avatar
Chao Liu committed
1104
1105
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1106
1107
1108
1109
1110
1111
    }
    if(offset == 3712)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3712\n \
                "
Chao Liu's avatar
Chao Liu committed
1112
1113
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1114
1115
1116
1117
1118
1119
    }
    if(offset == 3776)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3776\n \
                "
Chao Liu's avatar
Chao Liu committed
1120
1121
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1122
1123
1124
1125
1126
1127
    }
    if(offset == 3840)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3840\n \
                "
Chao Liu's avatar
Chao Liu committed
1128
1129
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1130
1131
1132
1133
1134
1135
    }
    if(offset == 3904)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3904\n \
                "
Chao Liu's avatar
Chao Liu committed
1136
1137
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1138
1139
1140
1141
1142
1143
    }
    if(offset == 3968)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3968\n \
                "
Chao Liu's avatar
Chao Liu committed
1144
1145
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1146
1147
1148
1149
1150
1151
    }
    if(offset == 4032)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4032\n \
                "
Chao Liu's avatar
Chao Liu committed
1152
1153
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1154
1155
1156
1157
1158
1159
    }
    if(offset == 4096)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4096\n \
                "
Chao Liu's avatar
Chao Liu committed
1160
1161
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
1162
    }
Jing Zhang's avatar
Jing Zhang committed
1163
1164
}

Chao Liu's avatar
Chao Liu committed
1165
1166
__device__ void
ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
1167
{
1168
1169
1170
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
1171
1172
            ds_write_b128 %0, %1 \n \
            "
1173
1174
1175
1176
1177
1178
1179
                     :
                     : "v"(__to_local(lds)), "v"(r));
    }
    else
    {
        assert(false);
    }
Jing Zhang's avatar
Jing Zhang committed
1180
}
1181
1182
1183

} // namespace ck
#endif