amd_inline_asm.hip.hpp 10.9 KB
Newer Older
Jing Zhang's avatar
Jing Zhang committed
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "common.hip.hpp"
Jing Zhang's avatar
Jing Zhang committed
3

Jing Zhang's avatar
Jing Zhang committed
4
5
6
7
8
9
#define NO_VM_WAIT 0
#define NO_LGKM_WAIT 0
#define NO_DS_READ 0
#define NO_DS_WRITE 0
#define NO_GLB_READ 0

Chao Liu's avatar
Chao Liu committed
10
11
12
// cast a pointer of LDS to its address
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];

Jing Zhang's avatar
Jing Zhang committed
13
14
15
16
17
#define data4_t vector_type<float, 4>::MemoryType
#define data_t float

template<unsigned cnt>
inline __device__ void vmcnt()
18
{
Jing Zhang's avatar
Jing Zhang committed
19
#if !NO_VM_WAIT
20
21
22
    if(cnt == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
23
                s_waitcnt vmcnt(0) \n \
24
                " ::);
Jing Zhang's avatar
Jing Zhang committed
25
    }
26
27
28
    else if(cnt == 1)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
29
                s_waitcnt vmcnt(1) \n \
30
                " ::);
Jing Zhang's avatar
Jing Zhang committed
31
    }
32
33
34
    else if(cnt == 2)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
35
                s_waitcnt vmcnt(2) \n \
36
                " ::);
Jing Zhang's avatar
Jing Zhang committed
37
    }
Jing Zhang's avatar
Jing Zhang committed
38
39
40
41
42
43
    else if(cnt == 3)
    {
        asm volatile("\n \
                s_waitcnt vmcnt(3) \n \
                " ::);
    }
44
45
46
    else if(cnt == 4)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
47
                s_waitcnt vmcnt(4) \n \
48
                " ::);
Jing Zhang's avatar
Jing Zhang committed
49
    }
50
51
    else
    {
52
        assert(false);
Jing Zhang's avatar
Jing Zhang committed
53
    }
Jing Zhang's avatar
Jing Zhang committed
54
#endif
Jing Zhang's avatar
Jing Zhang committed
55
56
}

Jing Zhang's avatar
Jing Zhang committed
57
inline __device__ void lgkmcnt(index_t cnt)
Chao Liu's avatar
Chao Liu committed
58
{
Jing Zhang's avatar
Jing Zhang committed
59
#if !NO_LGKM_WAIT
Chao Liu's avatar
Chao Liu committed
60
61
    if(cnt == 0)
    {
Jing Zhang's avatar
Jing Zhang committed
62
63
        asm volatile("\n \
                s_waitcnt lgkmcnt(0) \n \
Chao Liu's avatar
Chao Liu committed
64
                " ::);
Jing Zhang's avatar
Jing Zhang committed
65
    }
Chao Liu's avatar
Chao Liu committed
66
67
    else if(cnt == 1)
    {
Jing Zhang's avatar
Jing Zhang committed
68
69
        asm volatile("\n \
                s_waitcnt lgkmcnt(1) \n \
Chao Liu's avatar
Chao Liu committed
70
                " ::);
Jing Zhang's avatar
Jing Zhang committed
71
    }
Chao Liu's avatar
Chao Liu committed
72
73
    else if(cnt == 2)
    {
Jing Zhang's avatar
Jing Zhang committed
74
75
        asm volatile("\n \
                s_waitcnt lgkmcnt(2) \n \
Chao Liu's avatar
Chao Liu committed
76
                " ::);
Jing Zhang's avatar
Jing Zhang committed
77
    }
Chao Liu's avatar
Chao Liu committed
78
79
    else if(cnt == 3)
    {
Jing Zhang's avatar
Jing Zhang committed
80
81
        asm volatile("\n \
                s_waitcnt lgkmcnt(3) \n \
Chao Liu's avatar
Chao Liu committed
82
                " ::);
Jing Zhang's avatar
Jing Zhang committed
83
    }
Chao Liu's avatar
Chao Liu committed
84
85
    else if(cnt == 4)
    {
Jing Zhang's avatar
Jing Zhang committed
86
87
        asm volatile("\n \
                s_waitcnt lgkmcnt(4) \n \
Chao Liu's avatar
Chao Liu committed
88
                " ::);
Jing Zhang's avatar
Jing Zhang committed
89
    }
Chao Liu's avatar
Chao Liu committed
90
91
    else
    {
92
        assert(false);
Jing Zhang's avatar
Jing Zhang committed
93
    }
Jing Zhang's avatar
Jing Zhang committed
94
#endif
Jing Zhang's avatar
Jing Zhang committed
95
96
}

Jing Zhang's avatar
Jing Zhang committed
97
98
99
inline __device__ void outerProduct1x4(const data_t& a,
                                const data4_t& b,
                                data4_t& c)
Chao Liu's avatar
Chao Liu committed
100
{
Jing Zhang's avatar
Jing Zhang committed
101
#if 0 
Jing Zhang's avatar
Jing Zhang committed
102
103
104
105
106
107
108
109
110
111
112
113
    asm volatile(
            "\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
            :
            :"v"(c.x),"v"(c.y),"v"(c.z),"v"(c.w), \
            "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
            );
#else
Jing Zhang's avatar
Jing Zhang committed
114
115
116
117
//hijack here due to a compiler issue that cannot perform proper register
//mapping for float4 c
    data_t *c_p = (data_t *)&c;
    asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
118
119
120
121
122
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
Jing Zhang's avatar
Jing Zhang committed
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
                 : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
                 : "v"(a),
                   "v"(b.x),
                   "v"(b.y),
                   "v"(b.z),
                   "v"(b.w),
                   "0"(c[0]),
                   "1"(c[1]),
                   "2"(c[2]),
                   "3"(c[3]));
#endif
}

inline __device__ void outerProduct4x4(const data4_t& a,
                                const data4_t& b,
                                data4_t& c0,
                                data4_t& c1,
                                data4_t& c2,
                                data4_t& c3)
{
Jing Zhang's avatar
Jing Zhang committed
143
144
145
146
147
148
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
    outerProduct1x4(a.z, b, c2);
    outerProduct1x4(a.w, b, c3);
}

Jing Zhang's avatar
Jing Zhang committed
149
150
151
inline __device__ void outerProduct8x8(const data4_t* a,
                                const data4_t* b,
                                data4_t* c)
Jing Zhang's avatar
Jing Zhang committed
152
153
154
155
156
157
158
{
    outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
    outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
    outerProduct4x4(a[1], b[0], c[8], c[10], c[12], c[14]);
    outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
}

Jing Zhang's avatar
Jing Zhang committed
159
inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
160
{
Jing Zhang's avatar
Jing Zhang committed
161
#if !NO_DS_READ
Jing Zhang's avatar
Jing Zhang committed
162
163
164
165
166
    if(offset == 0)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:0 \n \
                "
Chao Liu's avatar
Chao Liu committed
167
168
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
169
170
171
172
173
174
    }
    else if(offset == 128)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:128 \n \
                "
Chao Liu's avatar
Chao Liu committed
175
176
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
177
178
179
180
181
182
    }
    else if(offset == 256)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:256 \n \
                "
Chao Liu's avatar
Chao Liu committed
183
184
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
185
186
187
188
189
190
    }
    else if(offset == 384)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:384 \n \
                "
Chao Liu's avatar
Chao Liu committed
191
192
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
193
194
195
196
197
198
    }
    else if(offset == 512)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:512 \n \
                "
Chao Liu's avatar
Chao Liu committed
199
200
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
201
202
203
204
205
206
    }
    else if(offset == 640)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:640 \n \
                "
Chao Liu's avatar
Chao Liu committed
207
208
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
209
210
211
212
213
214
    }
    else if(offset == 768)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:768 \n \
                "
Chao Liu's avatar
Chao Liu committed
215
216
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
217
218
219
220
221
222
    }
    else if(offset == 896)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:896 \n \
                "
Chao Liu's avatar
Chao Liu committed
223
224
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
225
226
227
228
229
230
    }
    else if(offset == 1024)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1024 \n \
                "
Chao Liu's avatar
Chao Liu committed
231
232
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
233
234
235
236
237
238
    }
    else if(offset == 1152)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1152 \n \
                "
Chao Liu's avatar
Chao Liu committed
239
240
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
241
242
243
244
245
246
    }
    else if(offset == 1280)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1280 \n \
                "
Chao Liu's avatar
Chao Liu committed
247
248
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
249
250
251
252
253
254
    }
    else if(offset == 1408)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1408 \n \
                "
Chao Liu's avatar
Chao Liu committed
255
256
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
257
258
259
260
261
262
    }
    else if(offset == 1536)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1536 \n \
                "
Chao Liu's avatar
Chao Liu committed
263
264
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
265
266
267
268
269
270
    }
    else if(offset == 1664)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1664 \n \
                "
Chao Liu's avatar
Chao Liu committed
271
272
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
273
274
275
276
277
278
    }
    else if(offset == 1792)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1792 \n \
                "
Chao Liu's avatar
Chao Liu committed
279
280
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
281
282
283
284
285
286
    }
    else if(offset == 1920)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:1920 \n \
                "
Chao Liu's avatar
Chao Liu committed
287
288
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
289
290
291
292
293
294
    }
    else if(offset == 2048)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2048 \n \
                "
Chao Liu's avatar
Chao Liu committed
295
296
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
297
298
299
300
301
302
    }
    else if(offset == 2176)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2176 \n \
                "
Chao Liu's avatar
Chao Liu committed
303
304
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
305
306
307
308
309
310
    }
    else if(offset == 2304)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2304 \n \
                "
Chao Liu's avatar
Chao Liu committed
311
312
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
313
314
315
316
317
318
    }
    else if(offset == 2560)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2560 \n \
                "
Chao Liu's avatar
Chao Liu committed
319
320
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
321
322
323
324
325
326
    }
    else if(offset == 2816)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2816 \n \
                "
Chao Liu's avatar
Chao Liu committed
327
328
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
329
330
331
332
333
334
    }
    else if(offset == 3072)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3072 \n \
                "
Chao Liu's avatar
Chao Liu committed
335
336
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
337
338
339
340
341
342
    }
    else if(offset == 3328)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3328 \n \
                "
Chao Liu's avatar
Chao Liu committed
343
344
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
345
346
347
348
349
350
    }
    else if(offset == 3584)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3584 \n \
                "
Chao Liu's avatar
Chao Liu committed
351
352
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
353
354
355
356
357
358
    }
    else if(offset == 3840)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3840 \n \
                "
Chao Liu's avatar
Chao Liu committed
359
360
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
361
362
363
364
365
366
    }
    else if(offset == 4096)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4096 \n \
                "
Chao Liu's avatar
Chao Liu committed
367
368
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
369
370
371
372
373
374
    }
    else if(offset == 4352)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4352 \n \
                "
Chao Liu's avatar
Chao Liu committed
375
376
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
377
378
379
    }
    else
    {
380
        assert(false);
Jing Zhang's avatar
Jing Zhang committed
381
    }
Jing Zhang's avatar
Jing Zhang committed
382
#endif
Jing Zhang's avatar
Jing Zhang committed
383
}
Jing Zhang's avatar
Jing Zhang committed
384

Jing Zhang's avatar
Jing Zhang committed
385
386
387
inline __device__ void global_store(data4_t& r,
                            const void* vptr,
                            const void* sprt = 0)
388
{
Jing Zhang's avatar
Jing Zhang committed
389
#if !NO_GLB_READ
Jing Zhang's avatar
Jing Zhang committed
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
    if(sprt == 0)
    {
        asm volatile("\n \
                global_store_dwordx4 %0, %1, off \n \
                "
                :: "v"(vptr), "v"(r));
    }
    else
    {
        asm volatile("\n \
                global_store_dwordx4 %0, %1, %2 \n \
                "
                :: "v"(vptr), "v"(r), "s"(sprt));
    }
#endif
}



Jing Zhang's avatar
Jing Zhang committed
409
inline __device__ void global_load(data4_t& r,
Jing Zhang's avatar
Jing Zhang committed
410
411
                            const void* vptr,
                            const void* sprt = 0)
412
{
Jing Zhang's avatar
Jing Zhang committed
413
#if !NO_GLB_READ
Jing Zhang's avatar
Jing Zhang committed
414
    if(sprt == 0)
415
416
417
418
    {
        asm volatile("\n \
                global_load_dwordx4 %0, %1, off \n \
                "
Jing Zhang's avatar
Jing Zhang committed
419
420
                : "=v"(r)
                : "v"(vptr));
421
422
423
    }
    else
    {
Jing Zhang's avatar
Jing Zhang committed
424
425
426
427
428
            asm volatile("\n \
                    global_load_dwordx4 %0, %1, %2 \n \
                    "
                    : "=v"(r)
                    : "v"(vptr), "s"(sprt));
429
    }
Jing Zhang's avatar
Jing Zhang committed
430
#endif
Jing Zhang's avatar
Jing Zhang committed
431
432
}

Jing Zhang's avatar
Jing Zhang committed
433
434
inline __device__ void
ds_write_b128(const data4_t& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
435
{
Jing Zhang's avatar
Jing Zhang committed
436
#if !NO_DS_WRITE
437
438
439
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
440
441
            ds_write_b128 %0, %1 \n \
            "
442
443
444
445
446
447
448
                     :
                     : "v"(__to_local(lds)), "v"(r));
    }
    else
    {
        assert(false);
    }
Jing Zhang's avatar
Jing Zhang committed
449
#endif
Jing Zhang's avatar
Jing Zhang committed
450
}