amd_inline_asm.hpp 21.2 KB
Newer Older
1
2
3
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP

Chao Liu's avatar
Chao Liu committed
4
#include "vector_type.hpp"
Jing Zhang's avatar
Jing Zhang committed
5

6
7
namespace ck {

Chao Liu's avatar
Chao Liu committed
8
// cast a pointer of LDS to its address
Chao Liu's avatar
Chao Liu committed
9
extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
Chao Liu's avatar
Chao Liu committed
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// buffer_load and buffer_store
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);

template <typename T, index_t VectorSize>
__device__ void buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                             T* p_dst_block,
                             uint32_t dst_thread_offset,
                             uint32_t dst_const_offset);

template <>
__device__ float buffer_load<float, 1>(const float* p_src_block,
                                       uint32_t src_thread_offset,
                                       uint32_t src_const_offset)
{
    float dst;

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));

    return dst;
}

template <>
__device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float* p_src_block,
                                                                   uint32_t src_thread_offset,
                                                                   uint32_t src_const_offset)
{
    vector_type<float, 2>::MemoryType dst;

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));

    return dst;
}

template <>
__device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float* p_src_block,
                                                                   uint32_t src_thread_offset,
                                                                   uint32_t src_const_offset)
{
    vector_type<float, 4>::MemoryType dst;

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));

    return dst;
}

template <>
__device__ void buffer_store<float, 1>(const float& src,
                                       float* p_dst_block,
                                       uint32_t dst_thread_offset,
                                       uint32_t dst_const_offset)
{
    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 :
                 : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));
}

Chao Liu's avatar
Chao Liu committed
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
__device__ void vmcnt(index_t cnt)
{
    if(cnt == 0)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(0) \n \
                " ::);
    }
    else if(cnt == 1)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(1) \n \
                " ::);
    }
    else if(cnt == 2)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(2) \n \
                " ::);
    }
    else if(cnt == 4)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(2) \n \
                " ::);
    }
    else
    {
        assert(false);
    }
}

__device__ void lgkmcnt(index_t cnt)
{
    if(cnt == 0)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(0) \n \
                " ::);
    }
    else if(cnt == 1)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(1) \n \
                " ::);
    }
    else if(cnt == 2)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(2) \n \
                " ::);
    }
    else if(cnt == 3)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(3) \n \
                " ::);
    }
    else if(cnt == 4)
    {
        asm volatile("\n \
                s_waitcnt lgkmcnt(4) \n \
                " ::);
    }
    else
    {
        assert(false);
    }
}

Chao Liu's avatar
Chao Liu committed
189
__device__ void outerProduct1x4(const float* a, const float* b, float* c)
Chao Liu's avatar
Chao Liu committed
190
{
Jing Zhang's avatar
Jing Zhang committed
191
192
193
194
195
196
    asm volatile("\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
Chao Liu's avatar
Chao Liu committed
197
198
199
200
201
202
203
204
205
206
                 : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
                 : "v"(a[0]),
                   "v"(b[0]),
                   "v"(b[1]),
                   "v"(b[2]),
                   "v"(b[3]),
                   "0"(c[0]),
                   "1"(c[1]),
                   "2"(c[2]),
                   "3"(c[3]));
Jing Zhang's avatar
Jing Zhang committed
207
208
}

Chao Liu's avatar
Chao Liu committed
209
210
211
__device__ void outerProduct1x4(const float& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c)
Chao Liu's avatar
Chao Liu committed
212
{
Chao Liu's avatar
Chao Liu committed
213
    outerProduct1x4(&a, reinterpret_cast<const float*>(&b), reinterpret_cast<float*>(&c));
Jing Zhang's avatar
Jing Zhang committed
214
215
}

Chao Liu's avatar
Chao Liu committed
216
217
218
219
220
221
222
223
224
__device__ void outerProduct2x4(const vector_type<float, 2>::MemoryType& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c0,
                                vector_type<float, 4>::MemoryType& c1)
{
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
}

Chao Liu's avatar
Chao Liu committed
225
226
227
228
229
230
__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c0,
                                vector_type<float, 4>::MemoryType& c1,
                                vector_type<float, 4>::MemoryType& c2,
                                vector_type<float, 4>::MemoryType& c3)
Chao Liu's avatar
Chao Liu committed
231
{
Jing Zhang's avatar
Jing Zhang committed
232
233
234
235
236
237
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
    outerProduct1x4(a.z, b, c2);
    outerProduct1x4(a.w, b, c3);
}

Chao Liu's avatar
Chao Liu committed
238
239
240
__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
                                const vector_type<float, 4>::MemoryType* b,
                                vector_type<float, 4>::MemoryType* c)
Jing Zhang's avatar
Jing Zhang committed
241
242
243
244
245
246
247
{
    outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
    outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
    outerProduct4x4(a[1], b[0], c[8], c[10], c[12], c[14]);
    outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
}

Chao Liu's avatar
Chao Liu committed
248
__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
249
250
251
252
{
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
253
                ds_read_b128 %0, %1 offset:0\n \
Jing Zhang's avatar
Jing Zhang committed
254
                "
Chao Liu's avatar
Chao Liu committed
255
256
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
257
    }
Jing Zhang's avatar
Jing Zhang committed
258
    if(offset == 64)
Jing Zhang's avatar
Jing Zhang committed
259
260
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
261
                ds_read_b128 %0, %1 offset:64\n \
Jing Zhang's avatar
Jing Zhang committed
262
                "
Chao Liu's avatar
Chao Liu committed
263
264
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
265
    }
Jing Zhang's avatar
Jing Zhang committed
266
    if(offset == 128)
Jing Zhang's avatar
Jing Zhang committed
267
268
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
269
                ds_read_b128 %0, %1 offset:128\n \
Jing Zhang's avatar
Jing Zhang committed
270
                "
Chao Liu's avatar
Chao Liu committed
271
272
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
273
    }
Jing Zhang's avatar
Jing Zhang committed
274
    if(offset == 192)
Jing Zhang's avatar
Jing Zhang committed
275
276
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
277
                ds_read_b128 %0, %1 offset:192\n \
Jing Zhang's avatar
Jing Zhang committed
278
                "
Chao Liu's avatar
Chao Liu committed
279
280
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
281
    }
Jing Zhang's avatar
Jing Zhang committed
282
    if(offset == 256)
Jing Zhang's avatar
Jing Zhang committed
283
284
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
285
                ds_read_b128 %0, %1 offset:256\n \
Jing Zhang's avatar
Jing Zhang committed
286
                "
Chao Liu's avatar
Chao Liu committed
287
288
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
289
    }
Jing Zhang's avatar
Jing Zhang committed
290
    if(offset == 320)
Jing Zhang's avatar
Jing Zhang committed
291
292
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
293
                ds_read_b128 %0, %1 offset:320\n \
Jing Zhang's avatar
Jing Zhang committed
294
                "
Chao Liu's avatar
Chao Liu committed
295
296
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
297
    }
Jing Zhang's avatar
Jing Zhang committed
298
    if(offset == 384)
Jing Zhang's avatar
Jing Zhang committed
299
300
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
301
                ds_read_b128 %0, %1 offset:384\n \
Jing Zhang's avatar
Jing Zhang committed
302
                "
Chao Liu's avatar
Chao Liu committed
303
304
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
305
    }
Jing Zhang's avatar
Jing Zhang committed
306
    if(offset == 448)
Jing Zhang's avatar
Jing Zhang committed
307
308
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
309
                ds_read_b128 %0, %1 offset:448\n \
Jing Zhang's avatar
Jing Zhang committed
310
                "
Chao Liu's avatar
Chao Liu committed
311
312
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
313
    }
Jing Zhang's avatar
Jing Zhang committed
314
    if(offset == 512)
Jing Zhang's avatar
Jing Zhang committed
315
316
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
317
                ds_read_b128 %0, %1 offset:512\n \
Jing Zhang's avatar
Jing Zhang committed
318
                "
Chao Liu's avatar
Chao Liu committed
319
320
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
321
    }
Jing Zhang's avatar
Jing Zhang committed
322
    if(offset == 576)
Jing Zhang's avatar
Jing Zhang committed
323
324
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
325
                ds_read_b128 %0, %1 offset:576\n \
Jing Zhang's avatar
Jing Zhang committed
326
                "
Chao Liu's avatar
Chao Liu committed
327
328
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
329
    }
Jing Zhang's avatar
Jing Zhang committed
330
    if(offset == 640)
Jing Zhang's avatar
Jing Zhang committed
331
332
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
333
                ds_read_b128 %0, %1 offset:640\n \
Jing Zhang's avatar
Jing Zhang committed
334
                "
Chao Liu's avatar
Chao Liu committed
335
336
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
337
    }
Jing Zhang's avatar
Jing Zhang committed
338
    if(offset == 704)
Jing Zhang's avatar
Jing Zhang committed
339
340
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
341
                ds_read_b128 %0, %1 offset:704\n \
Jing Zhang's avatar
Jing Zhang committed
342
                "
Chao Liu's avatar
Chao Liu committed
343
344
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
345
    }
Jing Zhang's avatar
Jing Zhang committed
346
    if(offset == 768)
Jing Zhang's avatar
Jing Zhang committed
347
348
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
349
                ds_read_b128 %0, %1 offset:768\n \
Jing Zhang's avatar
Jing Zhang committed
350
                "
Chao Liu's avatar
Chao Liu committed
351
352
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
353
    }
Jing Zhang's avatar
Jing Zhang committed
354
    if(offset == 832)
Jing Zhang's avatar
Jing Zhang committed
355
356
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
357
                ds_read_b128 %0, %1 offset:832\n \
Jing Zhang's avatar
Jing Zhang committed
358
                "
Chao Liu's avatar
Chao Liu committed
359
360
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
361
    }
Jing Zhang's avatar
Jing Zhang committed
362
    if(offset == 896)
Jing Zhang's avatar
Jing Zhang committed
363
364
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
365
                ds_read_b128 %0, %1 offset:896\n \
Jing Zhang's avatar
Jing Zhang committed
366
                "
Chao Liu's avatar
Chao Liu committed
367
368
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
369
    }
Jing Zhang's avatar
Jing Zhang committed
370
    if(offset == 960)
Jing Zhang's avatar
Jing Zhang committed
371
372
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
373
                ds_read_b128 %0, %1 offset:960\n \
Jing Zhang's avatar
Jing Zhang committed
374
                "
Chao Liu's avatar
Chao Liu committed
375
376
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
377
    }
Jing Zhang's avatar
Jing Zhang committed
378
    if(offset == 1024)
Jing Zhang's avatar
Jing Zhang committed
379
380
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
381
                ds_read_b128 %0, %1 offset:1024\n \
Jing Zhang's avatar
Jing Zhang committed
382
                "
Chao Liu's avatar
Chao Liu committed
383
384
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
385
    }
Jing Zhang's avatar
Jing Zhang committed
386
    if(offset == 1088)
Jing Zhang's avatar
Jing Zhang committed
387
388
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
389
                ds_read_b128 %0, %1 offset:1088\n \
Jing Zhang's avatar
Jing Zhang committed
390
                "
Chao Liu's avatar
Chao Liu committed
391
392
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
393
    }
Jing Zhang's avatar
Jing Zhang committed
394
    if(offset == 1152)
Jing Zhang's avatar
Jing Zhang committed
395
396
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
397
                ds_read_b128 %0, %1 offset:1152\n \
Jing Zhang's avatar
Jing Zhang committed
398
                "
Chao Liu's avatar
Chao Liu committed
399
400
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
401
    }
Jing Zhang's avatar
Jing Zhang committed
402
    if(offset == 1216)
Chao Liu's avatar
Chao Liu committed
403
404
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
405
                ds_read_b128 %0, %1 offset:1216\n \
Chao Liu's avatar
Chao Liu committed
406
                "
Chao Liu's avatar
Chao Liu committed
407
408
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
409
    }
Jing Zhang's avatar
Jing Zhang committed
410
    if(offset == 1280)
Jing Zhang's avatar
Jing Zhang committed
411
412
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
413
                ds_read_b128 %0, %1 offset:1280\n \
Jing Zhang's avatar
Jing Zhang committed
414
                "
Chao Liu's avatar
Chao Liu committed
415
416
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
417
    }
Jing Zhang's avatar
Jing Zhang committed
418
    if(offset == 1344)
Chao Liu's avatar
Chao Liu committed
419
420
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
421
                ds_read_b128 %0, %1 offset:1344\n \
Chao Liu's avatar
Chao Liu committed
422
                "
Chao Liu's avatar
Chao Liu committed
423
424
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
425
    }
Jing Zhang's avatar
Jing Zhang committed
426
    if(offset == 1408)
Jing Zhang's avatar
Jing Zhang committed
427
428
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
429
                ds_read_b128 %0, %1 offset:1408\n \
Jing Zhang's avatar
Jing Zhang committed
430
                "
Chao Liu's avatar
Chao Liu committed
431
432
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
433
    }
Jing Zhang's avatar
Jing Zhang committed
434
    if(offset == 1472)
Chao Liu's avatar
Chao Liu committed
435
436
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
437
                ds_read_b128 %0, %1 offset:1472\n \
Chao Liu's avatar
Chao Liu committed
438
                "
Chao Liu's avatar
Chao Liu committed
439
440
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
441
    }
Jing Zhang's avatar
Jing Zhang committed
442
    if(offset == 1536)
Jing Zhang's avatar
Jing Zhang committed
443
444
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
445
                ds_read_b128 %0, %1 offset:1536\n \
Jing Zhang's avatar
Jing Zhang committed
446
                "
Chao Liu's avatar
Chao Liu committed
447
448
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
449
    }
Jing Zhang's avatar
Jing Zhang committed
450
    if(offset == 1600)
Chao Liu's avatar
Chao Liu committed
451
452
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
453
                ds_read_b128 %0, %1 offset:1600\n \
Chao Liu's avatar
Chao Liu committed
454
                "
Chao Liu's avatar
Chao Liu committed
455
456
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
457
    }
Jing Zhang's avatar
Jing Zhang committed
458
    if(offset == 1664)
Jing Zhang's avatar
Jing Zhang committed
459
460
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
461
                ds_read_b128 %0, %1 offset:1664\n \
Jing Zhang's avatar
Jing Zhang committed
462
                "
Chao Liu's avatar
Chao Liu committed
463
464
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
465
    }
Jing Zhang's avatar
Jing Zhang committed
466
    if(offset == 1728)
Chao Liu's avatar
Chao Liu committed
467
468
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
469
                ds_read_b128 %0, %1 offset:1728\n \
Chao Liu's avatar
Chao Liu committed
470
                "
Chao Liu's avatar
Chao Liu committed
471
472
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
473
    }
Jing Zhang's avatar
Jing Zhang committed
474
    if(offset == 1792)
Jing Zhang's avatar
Jing Zhang committed
475
476
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
477
                ds_read_b128 %0, %1 offset:1792\n \
Jing Zhang's avatar
Jing Zhang committed
478
                "
Chao Liu's avatar
Chao Liu committed
479
480
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
481
    }
Jing Zhang's avatar
Jing Zhang committed
482
    if(offset == 1856)
Chao Liu's avatar
Chao Liu committed
483
484
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
485
                ds_read_b128 %0, %1 offset:1856\n \
Chao Liu's avatar
Chao Liu committed
486
                "
Chao Liu's avatar
Chao Liu committed
487
488
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
489
    }
Jing Zhang's avatar
Jing Zhang committed
490
    if(offset == 1920)
Jing Zhang's avatar
Jing Zhang committed
491
492
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
493
                ds_read_b128 %0, %1 offset:1920\n \
Jing Zhang's avatar
Jing Zhang committed
494
                "
Chao Liu's avatar
Chao Liu committed
495
496
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
497
    }
Jing Zhang's avatar
Jing Zhang committed
498
    if(offset == 1984)
Chao Liu's avatar
Chao Liu committed
499
500
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
501
                ds_read_b128 %0, %1 offset:1984\n \
Chao Liu's avatar
Chao Liu committed
502
                "
Chao Liu's avatar
Chao Liu committed
503
504
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
505
    }
Jing Zhang's avatar
Jing Zhang committed
506
    if(offset == 2048)
Jing Zhang's avatar
Jing Zhang committed
507
508
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
509
                ds_read_b128 %0, %1 offset:2048\n \
Jing Zhang's avatar
Jing Zhang committed
510
                "
Chao Liu's avatar
Chao Liu committed
511
512
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
513
    }
Jing Zhang's avatar
Jing Zhang committed
514
    if(offset == 2112)
Jing Zhang's avatar
Jing Zhang committed
515
516
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
517
                ds_read_b128 %0, %1 offset:2112\n \
Jing Zhang's avatar
Jing Zhang committed
518
                "
Chao Liu's avatar
Chao Liu committed
519
520
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
521
    }
Jing Zhang's avatar
Jing Zhang committed
522
    if(offset == 2176)
Jing Zhang's avatar
Jing Zhang committed
523
    {
Jing Zhang's avatar
Jing Zhang committed
524
525
526
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2176\n \
                "
Chao Liu's avatar
Chao Liu committed
527
528
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
529
530
531
532
533
534
    }
    if(offset == 2240)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2240\n \
                "
Chao Liu's avatar
Chao Liu committed
535
536
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
537
538
539
540
541
542
    }
    if(offset == 2304)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2304\n \
                "
Chao Liu's avatar
Chao Liu committed
543
544
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
545
546
547
548
549
550
    }
    if(offset == 2368)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2368\n \
                "
Chao Liu's avatar
Chao Liu committed
551
552
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
553
554
555
556
557
558
    }
    if(offset == 2432)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2432\n \
                "
Chao Liu's avatar
Chao Liu committed
559
560
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
561
562
563
564
565
566
    }
    if(offset == 2496)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2496\n \
                "
Chao Liu's avatar
Chao Liu committed
567
568
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
569
570
571
572
573
574
    }
    if(offset == 2560)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2560\n \
                "
Chao Liu's avatar
Chao Liu committed
575
576
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
577
578
579
580
581
582
    }
    if(offset == 2624)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2624\n \
                "
Chao Liu's avatar
Chao Liu committed
583
584
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
585
586
587
588
589
590
    }
    if(offset == 2688)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2688\n \
                "
Chao Liu's avatar
Chao Liu committed
591
592
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
593
594
595
596
597
598
    }
    if(offset == 2752)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2752\n \
                "
Chao Liu's avatar
Chao Liu committed
599
600
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
601
602
603
604
605
606
    }
    if(offset == 2816)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2816\n \
                "
Chao Liu's avatar
Chao Liu committed
607
608
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
609
610
611
612
613
614
    }
    if(offset == 2880)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2880\n \
                "
Chao Liu's avatar
Chao Liu committed
615
616
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
617
618
619
620
621
622
    }
    if(offset == 2944)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2944\n \
                "
Chao Liu's avatar
Chao Liu committed
623
624
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
625
626
627
628
629
630
    }
    if(offset == 3008)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3008\n \
                "
Chao Liu's avatar
Chao Liu committed
631
632
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
633
634
635
636
637
638
    }
    if(offset == 3072)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3072\n \
                "
Chao Liu's avatar
Chao Liu committed
639
640
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
641
642
643
644
645
646
    }
    if(offset == 3136)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3136\n \
                "
Chao Liu's avatar
Chao Liu committed
647
648
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
649
650
651
652
653
654
    }
    if(offset == 3200)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3200\n \
                "
Chao Liu's avatar
Chao Liu committed
655
656
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
657
658
659
660
661
662
    }
    if(offset == 3264)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3264\n \
                "
Chao Liu's avatar
Chao Liu committed
663
664
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
665
666
667
668
669
670
    }
    if(offset == 3328)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3328\n \
                "
Chao Liu's avatar
Chao Liu committed
671
672
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
673
674
675
676
677
678
    }
    if(offset == 3392)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3392\n \
                "
Chao Liu's avatar
Chao Liu committed
679
680
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
681
682
683
684
685
686
    }
    if(offset == 3456)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3456\n \
                "
Chao Liu's avatar
Chao Liu committed
687
688
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
689
690
691
692
693
694
    }
    if(offset == 3520)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3520\n \
                "
Chao Liu's avatar
Chao Liu committed
695
696
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
697
698
699
700
701
702
    }
    if(offset == 3584)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3584\n \
                "
Chao Liu's avatar
Chao Liu committed
703
704
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
705
706
707
708
709
710
    }
    if(offset == 3648)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3648\n \
                "
Chao Liu's avatar
Chao Liu committed
711
712
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
713
714
715
716
717
718
    }
    if(offset == 3712)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3712\n \
                "
Chao Liu's avatar
Chao Liu committed
719
720
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
721
722
723
724
725
726
    }
    if(offset == 3776)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3776\n \
                "
Chao Liu's avatar
Chao Liu committed
727
728
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
729
730
731
732
733
734
    }
    if(offset == 3840)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3840\n \
                "
Chao Liu's avatar
Chao Liu committed
735
736
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
737
738
739
740
741
742
    }
    if(offset == 3904)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3904\n \
                "
Chao Liu's avatar
Chao Liu committed
743
744
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
745
746
747
748
749
750
    }
    if(offset == 3968)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3968\n \
                "
Chao Liu's avatar
Chao Liu committed
751
752
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
753
754
755
756
757
758
    }
    if(offset == 4032)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4032\n \
                "
Chao Liu's avatar
Chao Liu committed
759
760
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
761
762
763
764
765
766
    }
    if(offset == 4096)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4096\n \
                "
Chao Liu's avatar
Chao Liu committed
767
768
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
769
    }
Jing Zhang's avatar
Jing Zhang committed
770
771
}

Chao Liu's avatar
Chao Liu committed
772
773
__device__ void
ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
774
{
775
776
777
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
778
779
            ds_write_b128 %0, %1 \n \
            "
780
781
782
783
784
785
786
                     :
                     : "v"(__to_local(lds)), "v"(r));
    }
    else
    {
        assert(false);
    }
Jing Zhang's avatar
Jing Zhang committed
787
}
788
789
790

} // namespace ck
#endif