"megatron/vscode:/vscode.git/clone" did not exist on "22a3d81ab9ed767718fe940422595b31f27dd20c"
amd_inline_asm.hip.hpp 21.8 KB
Newer Older
Jing Zhang's avatar
Jing Zhang committed
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "common.hip.hpp"
Jing Zhang's avatar
Jing Zhang committed
3

Jing Zhang's avatar
Jing Zhang committed
4
5
6
7
8
9
#define NO_VM_WAIT 0
#define NO_LGKM_WAIT 0
#define NO_DS_READ 0
#define NO_DS_WRITE 0
#define NO_GLB_READ 0

Chao Liu's avatar
Chao Liu committed
10
// cast a pointer of LDS to its address
Jing Zhang's avatar
add asm  
Jing Zhang committed
11
extern "C" __attribute__((address_space(3))) void* __to_local(const void* p)[[hc]];
Chao Liu's avatar
Chao Liu committed
12
13

__device__ void vmcnt(index_t cnt)
14
{
Jing Zhang's avatar
Jing Zhang committed
15
#if !NO_VM_WAIT
16
17
18
    if(cnt == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
19
                s_waitcnt vmcnt(0) \n \
20
                " ::);
Jing Zhang's avatar
Jing Zhang committed
21
    }
22
23
24
    else if(cnt == 1)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
25
                s_waitcnt vmcnt(1) \n \
26
                " ::);
Jing Zhang's avatar
Jing Zhang committed
27
    }
28
29
30
    else if(cnt == 2)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
31
                s_waitcnt vmcnt(2) \n \
32
                " ::);
Jing Zhang's avatar
Jing Zhang committed
33
    }
34
35
36
    else if(cnt == 4)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
37
                s_waitcnt vmcnt(2) \n \
38
                " ::);
Jing Zhang's avatar
Jing Zhang committed
39
    }
40
41
    else
    {
42
        assert(false);
Jing Zhang's avatar
Jing Zhang committed
43
    }
Jing Zhang's avatar
Jing Zhang committed
44
#endif
Jing Zhang's avatar
Jing Zhang committed
45
46
}

Chao Liu's avatar
Chao Liu committed
47
__device__ void lgkmcnt(index_t cnt)
Chao Liu's avatar
Chao Liu committed
48
{
Jing Zhang's avatar
Jing Zhang committed
49
#if !NO_LGKM_WAIT
Chao Liu's avatar
Chao Liu committed
50
51
    if(cnt == 0)
    {
Jing Zhang's avatar
Jing Zhang committed
52
53
        asm volatile("\n \
                s_waitcnt lgkmcnt(0) \n \
Chao Liu's avatar
Chao Liu committed
54
                " ::);
Jing Zhang's avatar
Jing Zhang committed
55
    }
Chao Liu's avatar
Chao Liu committed
56
57
    else if(cnt == 1)
    {
Jing Zhang's avatar
Jing Zhang committed
58
59
        asm volatile("\n \
                s_waitcnt lgkmcnt(1) \n \
Chao Liu's avatar
Chao Liu committed
60
                " ::);
Jing Zhang's avatar
Jing Zhang committed
61
    }
Chao Liu's avatar
Chao Liu committed
62
63
    else if(cnt == 2)
    {
Jing Zhang's avatar
Jing Zhang committed
64
65
        asm volatile("\n \
                s_waitcnt lgkmcnt(2) \n \
Chao Liu's avatar
Chao Liu committed
66
                " ::);
Jing Zhang's avatar
Jing Zhang committed
67
    }
Chao Liu's avatar
Chao Liu committed
68
69
    else if(cnt == 3)
    {
Jing Zhang's avatar
Jing Zhang committed
70
71
        asm volatile("\n \
                s_waitcnt lgkmcnt(3) \n \
Chao Liu's avatar
Chao Liu committed
72
                " ::);
Jing Zhang's avatar
Jing Zhang committed
73
    }
Chao Liu's avatar
Chao Liu committed
74
75
    else if(cnt == 4)
    {
Jing Zhang's avatar
Jing Zhang committed
76
77
        asm volatile("\n \
                s_waitcnt lgkmcnt(4) \n \
Chao Liu's avatar
Chao Liu committed
78
                " ::);
Jing Zhang's avatar
Jing Zhang committed
79
    }
Chao Liu's avatar
Chao Liu committed
80
81
    else
    {
82
        assert(false);
Jing Zhang's avatar
Jing Zhang committed
83
    }
Jing Zhang's avatar
Jing Zhang committed
84
#endif
Jing Zhang's avatar
Jing Zhang committed
85
86
}

Chao Liu's avatar
Chao Liu committed
87
__device__ void outerProduct1x4(const float* a, const float* b, float* c)
Chao Liu's avatar
Chao Liu committed
88
{
Jing Zhang's avatar
Jing Zhang committed
89
90
91
92
93
94
    asm volatile("\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
Chao Liu's avatar
Chao Liu committed
95
96
97
98
99
100
101
102
103
104
                 : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
                 : "v"(a[0]),
                   "v"(b[0]),
                   "v"(b[1]),
                   "v"(b[2]),
                   "v"(b[3]),
                   "0"(c[0]),
                   "1"(c[1]),
                   "2"(c[2]),
                   "3"(c[3]));
Jing Zhang's avatar
Jing Zhang committed
105
106
}

Chao Liu's avatar
Chao Liu committed
107
108
109
__device__ void outerProduct1x4(const float& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c)
Chao Liu's avatar
Chao Liu committed
110
{
Jing Zhang's avatar
Jing Zhang committed
111
112
113
114
115
116
117
118
119
120
121
122
123
#if 0
    asm volatile(
            "\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
            :
            :"v"(c.x),"v"(c.y),"v"(c.z),"v"(c.w), \
            "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
            );
#else
Chao Liu's avatar
Chao Liu committed
124
    outerProduct1x4(&a, (float*)&b, (float*)&c);
Jing Zhang's avatar
Jing Zhang committed
125
126
127
#endif
}

Chao Liu's avatar
Chao Liu committed
128
129
130
131
132
133
__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
                                const vector_type<float, 4>::MemoryType& b,
                                vector_type<float, 4>::MemoryType& c0,
                                vector_type<float, 4>::MemoryType& c1,
                                vector_type<float, 4>::MemoryType& c2,
                                vector_type<float, 4>::MemoryType& c3)
Chao Liu's avatar
Chao Liu committed
134
{
Jing Zhang's avatar
Jing Zhang committed
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#if 0
    asm volatile(
            "\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
            :
            :"v"(c0.x),"v"(c0.y),"v"(c0.z),"v"(c0.w), \
            "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
            );
    asm volatile(
            "\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
            :
            :"v"(c1.x),"v"(c1.y),"v"(c1.z),"v"(c1.w), \
            "v"(a.y),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
            );
    asm volatile(
            "\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
            :
            :"v"(c2.x),"v"(c2.y),"v"(c2.z),"v"(c2.w), \
            "v"(a.z),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
            );
    asm volatile(
            "\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
            :
            :"v"(c3.x),"v"(c3.y),"v"(c3.z),"v"(c3.w), \
            "v"(a.w),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
            );
#else
    outerProduct1x4(a.x, b, c0);
    outerProduct1x4(a.y, b, c1);
    outerProduct1x4(a.z, b, c2);
    outerProduct1x4(a.w, b, c3);
#endif
}

Chao Liu's avatar
Chao Liu committed
188
189
190
__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
                                const vector_type<float, 4>::MemoryType* b,
                                vector_type<float, 4>::MemoryType* c)
Jing Zhang's avatar
Jing Zhang committed
191
192
193
194
195
196
197
{
    outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
    outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
    outerProduct4x4(a[1], b[0], c[8], c[10], c[12], c[14]);
    outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
}

Chao Liu's avatar
Chao Liu committed
198
__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
199
{
Jing Zhang's avatar
Jing Zhang committed
200
#if !NO_DS_READ
Jing Zhang's avatar
Jing Zhang committed
201
202
203
    if(offset == 0)
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
204
                ds_read_b128 %0, %1 offset:0\n \
Jing Zhang's avatar
Jing Zhang committed
205
                "
Chao Liu's avatar
Chao Liu committed
206
207
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
208
    }
Jing Zhang's avatar
Jing Zhang committed
209
    if(offset == 64)
Jing Zhang's avatar
Jing Zhang committed
210
211
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
212
                ds_read_b128 %0, %1 offset:64\n \
Jing Zhang's avatar
Jing Zhang committed
213
                "
Chao Liu's avatar
Chao Liu committed
214
215
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
216
    }
Jing Zhang's avatar
Jing Zhang committed
217
    if(offset == 128)
Jing Zhang's avatar
Jing Zhang committed
218
219
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
220
                ds_read_b128 %0, %1 offset:128\n \
Jing Zhang's avatar
Jing Zhang committed
221
                "
Chao Liu's avatar
Chao Liu committed
222
223
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
224
    }
Jing Zhang's avatar
Jing Zhang committed
225
    if(offset == 192)
Jing Zhang's avatar
Jing Zhang committed
226
227
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
228
                ds_read_b128 %0, %1 offset:192\n \
Jing Zhang's avatar
Jing Zhang committed
229
                "
Chao Liu's avatar
Chao Liu committed
230
231
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
232
    }
Jing Zhang's avatar
Jing Zhang committed
233
    if(offset == 256)
Jing Zhang's avatar
Jing Zhang committed
234
235
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
236
                ds_read_b128 %0, %1 offset:256\n \
Jing Zhang's avatar
Jing Zhang committed
237
                "
Chao Liu's avatar
Chao Liu committed
238
239
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
240
    }
Jing Zhang's avatar
Jing Zhang committed
241
    if(offset == 320)
Jing Zhang's avatar
Jing Zhang committed
242
243
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
244
                ds_read_b128 %0, %1 offset:320\n \
Jing Zhang's avatar
Jing Zhang committed
245
                "
Chao Liu's avatar
Chao Liu committed
246
247
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
248
    }
Jing Zhang's avatar
Jing Zhang committed
249
    if(offset == 384)
Jing Zhang's avatar
Jing Zhang committed
250
251
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
252
                ds_read_b128 %0, %1 offset:384\n \
Jing Zhang's avatar
Jing Zhang committed
253
                "
Chao Liu's avatar
Chao Liu committed
254
255
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
256
    }
Jing Zhang's avatar
Jing Zhang committed
257
    if(offset == 448)
Jing Zhang's avatar
Jing Zhang committed
258
259
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
260
                ds_read_b128 %0, %1 offset:448\n \
Jing Zhang's avatar
Jing Zhang committed
261
                "
Chao Liu's avatar
Chao Liu committed
262
263
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
264
    }
Jing Zhang's avatar
Jing Zhang committed
265
    if(offset == 512)
Jing Zhang's avatar
Jing Zhang committed
266
267
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
268
                ds_read_b128 %0, %1 offset:512\n \
Jing Zhang's avatar
Jing Zhang committed
269
                "
Chao Liu's avatar
Chao Liu committed
270
271
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
272
    }
Jing Zhang's avatar
Jing Zhang committed
273
    if(offset == 576)
Jing Zhang's avatar
Jing Zhang committed
274
275
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
276
                ds_read_b128 %0, %1 offset:576\n \
Jing Zhang's avatar
Jing Zhang committed
277
                "
Chao Liu's avatar
Chao Liu committed
278
279
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
280
    }
Jing Zhang's avatar
Jing Zhang committed
281
    if(offset == 640)
Jing Zhang's avatar
Jing Zhang committed
282
283
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
284
                ds_read_b128 %0, %1 offset:640\n \
Jing Zhang's avatar
Jing Zhang committed
285
                "
Chao Liu's avatar
Chao Liu committed
286
287
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
288
    }
Jing Zhang's avatar
Jing Zhang committed
289
    if(offset == 704)
Jing Zhang's avatar
Jing Zhang committed
290
291
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
292
                ds_read_b128 %0, %1 offset:704\n \
Jing Zhang's avatar
Jing Zhang committed
293
                "
Chao Liu's avatar
Chao Liu committed
294
295
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
296
    }
Jing Zhang's avatar
Jing Zhang committed
297
    if(offset == 768)
Jing Zhang's avatar
Jing Zhang committed
298
299
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
300
                ds_read_b128 %0, %1 offset:768\n \
Jing Zhang's avatar
Jing Zhang committed
301
                "
Chao Liu's avatar
Chao Liu committed
302
303
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
304
    }
Jing Zhang's avatar
Jing Zhang committed
305
    if(offset == 832)
Jing Zhang's avatar
Jing Zhang committed
306
307
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
308
                ds_read_b128 %0, %1 offset:832\n \
Jing Zhang's avatar
Jing Zhang committed
309
                "
Chao Liu's avatar
Chao Liu committed
310
311
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
312
    }
Jing Zhang's avatar
Jing Zhang committed
313
    if(offset == 896)
Jing Zhang's avatar
Jing Zhang committed
314
315
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
316
                ds_read_b128 %0, %1 offset:896\n \
Jing Zhang's avatar
Jing Zhang committed
317
                "
Chao Liu's avatar
Chao Liu committed
318
319
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
320
    }
Jing Zhang's avatar
Jing Zhang committed
321
    if(offset == 960)
Jing Zhang's avatar
Jing Zhang committed
322
323
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
324
                ds_read_b128 %0, %1 offset:960\n \
Jing Zhang's avatar
Jing Zhang committed
325
                "
Chao Liu's avatar
Chao Liu committed
326
327
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
328
    }
Jing Zhang's avatar
Jing Zhang committed
329
    if(offset == 1024)
Jing Zhang's avatar
Jing Zhang committed
330
331
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
332
                ds_read_b128 %0, %1 offset:1024\n \
Jing Zhang's avatar
Jing Zhang committed
333
                "
Chao Liu's avatar
Chao Liu committed
334
335
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
336
    }
Jing Zhang's avatar
Jing Zhang committed
337
    if(offset == 1088)
Jing Zhang's avatar
Jing Zhang committed
338
339
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
340
                ds_read_b128 %0, %1 offset:1088\n \
Jing Zhang's avatar
Jing Zhang committed
341
                "
Chao Liu's avatar
Chao Liu committed
342
343
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
344
    }
Jing Zhang's avatar
Jing Zhang committed
345
    if(offset == 1152)
Jing Zhang's avatar
Jing Zhang committed
346
347
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
348
                ds_read_b128 %0, %1 offset:1152\n \
Jing Zhang's avatar
Jing Zhang committed
349
                "
Chao Liu's avatar
Chao Liu committed
350
351
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
352
    }
Jing Zhang's avatar
Jing Zhang committed
353
    if(offset == 1216)
Chao Liu's avatar
Chao Liu committed
354
355
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
356
                ds_read_b128 %0, %1 offset:1216\n \
Chao Liu's avatar
Chao Liu committed
357
                "
Chao Liu's avatar
Chao Liu committed
358
359
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
360
    }
Jing Zhang's avatar
Jing Zhang committed
361
    if(offset == 1280)
Jing Zhang's avatar
Jing Zhang committed
362
363
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
364
                ds_read_b128 %0, %1 offset:1280\n \
Jing Zhang's avatar
Jing Zhang committed
365
                "
Chao Liu's avatar
Chao Liu committed
366
367
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
368
    }
Jing Zhang's avatar
Jing Zhang committed
369
    if(offset == 1344)
Chao Liu's avatar
Chao Liu committed
370
371
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
372
                ds_read_b128 %0, %1 offset:1344\n \
Chao Liu's avatar
Chao Liu committed
373
                "
Chao Liu's avatar
Chao Liu committed
374
375
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
376
    }
Jing Zhang's avatar
Jing Zhang committed
377
    if(offset == 1408)
Jing Zhang's avatar
Jing Zhang committed
378
379
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
380
                ds_read_b128 %0, %1 offset:1408\n \
Jing Zhang's avatar
Jing Zhang committed
381
                "
Chao Liu's avatar
Chao Liu committed
382
383
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
384
    }
Jing Zhang's avatar
Jing Zhang committed
385
    if(offset == 1472)
Chao Liu's avatar
Chao Liu committed
386
387
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
388
                ds_read_b128 %0, %1 offset:1472\n \
Chao Liu's avatar
Chao Liu committed
389
                "
Chao Liu's avatar
Chao Liu committed
390
391
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
392
    }
Jing Zhang's avatar
Jing Zhang committed
393
    if(offset == 1536)
Jing Zhang's avatar
Jing Zhang committed
394
395
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
396
                ds_read_b128 %0, %1 offset:1536\n \
Jing Zhang's avatar
Jing Zhang committed
397
                "
Chao Liu's avatar
Chao Liu committed
398
399
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
400
    }
Jing Zhang's avatar
Jing Zhang committed
401
    if(offset == 1600)
Chao Liu's avatar
Chao Liu committed
402
403
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
404
                ds_read_b128 %0, %1 offset:1600\n \
Chao Liu's avatar
Chao Liu committed
405
                "
Chao Liu's avatar
Chao Liu committed
406
407
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
408
    }
Jing Zhang's avatar
Jing Zhang committed
409
    if(offset == 1664)
Jing Zhang's avatar
Jing Zhang committed
410
411
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
412
                ds_read_b128 %0, %1 offset:1664\n \
Jing Zhang's avatar
Jing Zhang committed
413
                "
Chao Liu's avatar
Chao Liu committed
414
415
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
416
    }
Jing Zhang's avatar
Jing Zhang committed
417
    if(offset == 1728)
Chao Liu's avatar
Chao Liu committed
418
419
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
420
                ds_read_b128 %0, %1 offset:1728\n \
Chao Liu's avatar
Chao Liu committed
421
                "
Chao Liu's avatar
Chao Liu committed
422
423
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
424
    }
Jing Zhang's avatar
Jing Zhang committed
425
    if(offset == 1792)
Jing Zhang's avatar
Jing Zhang committed
426
427
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
428
                ds_read_b128 %0, %1 offset:1792\n \
Jing Zhang's avatar
Jing Zhang committed
429
                "
Chao Liu's avatar
Chao Liu committed
430
431
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
432
    }
Jing Zhang's avatar
Jing Zhang committed
433
    if(offset == 1856)
Chao Liu's avatar
Chao Liu committed
434
435
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
436
                ds_read_b128 %0, %1 offset:1856\n \
Chao Liu's avatar
Chao Liu committed
437
                "
Chao Liu's avatar
Chao Liu committed
438
439
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
440
    }
Jing Zhang's avatar
Jing Zhang committed
441
    if(offset == 1920)
Jing Zhang's avatar
Jing Zhang committed
442
443
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
444
                ds_read_b128 %0, %1 offset:1920\n \
Jing Zhang's avatar
Jing Zhang committed
445
                "
Chao Liu's avatar
Chao Liu committed
446
447
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
448
    }
Jing Zhang's avatar
Jing Zhang committed
449
    if(offset == 1984)
Chao Liu's avatar
Chao Liu committed
450
451
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
452
                ds_read_b128 %0, %1 offset:1984\n \
Chao Liu's avatar
Chao Liu committed
453
                "
Chao Liu's avatar
Chao Liu committed
454
455
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Chao Liu's avatar
Chao Liu committed
456
    }
Jing Zhang's avatar
Jing Zhang committed
457
    if(offset == 2048)
Jing Zhang's avatar
Jing Zhang committed
458
459
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
460
                ds_read_b128 %0, %1 offset:2048\n \
Jing Zhang's avatar
Jing Zhang committed
461
                "
Chao Liu's avatar
Chao Liu committed
462
463
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
464
    }
Jing Zhang's avatar
Jing Zhang committed
465
    if(offset == 2112)
Jing Zhang's avatar
Jing Zhang committed
466
467
    {
        asm volatile("\n \
Jing Zhang's avatar
Jing Zhang committed
468
                ds_read_b128 %0, %1 offset:2112\n \
Jing Zhang's avatar
Jing Zhang committed
469
                "
Chao Liu's avatar
Chao Liu committed
470
471
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
472
    }
Jing Zhang's avatar
Jing Zhang committed
473
    if(offset == 2176)
Jing Zhang's avatar
Jing Zhang committed
474
    {
Jing Zhang's avatar
Jing Zhang committed
475
476
477
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2176\n \
                "
Chao Liu's avatar
Chao Liu committed
478
479
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
480
481
482
483
484
485
    }
    if(offset == 2240)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2240\n \
                "
Chao Liu's avatar
Chao Liu committed
486
487
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
488
489
490
491
492
493
    }
    if(offset == 2304)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2304\n \
                "
Chao Liu's avatar
Chao Liu committed
494
495
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
496
497
498
499
500
501
    }
    if(offset == 2368)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2368\n \
                "
Chao Liu's avatar
Chao Liu committed
502
503
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
504
505
506
507
508
509
    }
    if(offset == 2432)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2432\n \
                "
Chao Liu's avatar
Chao Liu committed
510
511
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
512
513
514
515
516
517
    }
    if(offset == 2496)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2496\n \
                "
Chao Liu's avatar
Chao Liu committed
518
519
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
520
521
522
523
524
525
    }
    if(offset == 2560)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2560\n \
                "
Chao Liu's avatar
Chao Liu committed
526
527
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
528
529
530
531
532
533
    }
    if(offset == 2624)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2624\n \
                "
Chao Liu's avatar
Chao Liu committed
534
535
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
536
537
538
539
540
541
    }
    if(offset == 2688)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2688\n \
                "
Chao Liu's avatar
Chao Liu committed
542
543
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
544
545
546
547
548
549
    }
    if(offset == 2752)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2752\n \
                "
Chao Liu's avatar
Chao Liu committed
550
551
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
552
553
554
555
556
557
    }
    if(offset == 2816)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2816\n \
                "
Chao Liu's avatar
Chao Liu committed
558
559
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
560
561
562
563
564
565
    }
    if(offset == 2880)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2880\n \
                "
Chao Liu's avatar
Chao Liu committed
566
567
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
568
569
570
571
572
573
    }
    if(offset == 2944)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:2944\n \
                "
Chao Liu's avatar
Chao Liu committed
574
575
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
576
577
578
579
580
581
    }
    if(offset == 3008)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3008\n \
                "
Chao Liu's avatar
Chao Liu committed
582
583
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
584
585
586
587
588
589
    }
    if(offset == 3072)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3072\n \
                "
Chao Liu's avatar
Chao Liu committed
590
591
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
592
593
594
595
596
597
    }
    if(offset == 3136)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3136\n \
                "
Chao Liu's avatar
Chao Liu committed
598
599
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
600
601
602
603
604
605
    }
    if(offset == 3200)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3200\n \
                "
Chao Liu's avatar
Chao Liu committed
606
607
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
608
609
610
611
612
613
    }
    if(offset == 3264)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3264\n \
                "
Chao Liu's avatar
Chao Liu committed
614
615
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
616
617
618
619
620
621
    }
    if(offset == 3328)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3328\n \
                "
Chao Liu's avatar
Chao Liu committed
622
623
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
624
625
626
627
628
629
    }
    if(offset == 3392)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3392\n \
                "
Chao Liu's avatar
Chao Liu committed
630
631
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
632
633
634
635
636
637
    }
    if(offset == 3456)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3456\n \
                "
Chao Liu's avatar
Chao Liu committed
638
639
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
640
641
642
643
644
645
    }
    if(offset == 3520)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3520\n \
                "
Chao Liu's avatar
Chao Liu committed
646
647
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
648
649
650
651
652
653
    }
    if(offset == 3584)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3584\n \
                "
Chao Liu's avatar
Chao Liu committed
654
655
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
656
657
658
659
660
661
    }
    if(offset == 3648)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3648\n \
                "
Chao Liu's avatar
Chao Liu committed
662
663
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
664
665
666
667
668
669
    }
    if(offset == 3712)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3712\n \
                "
Chao Liu's avatar
Chao Liu committed
670
671
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
672
673
674
675
676
677
    }
    if(offset == 3776)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3776\n \
                "
Chao Liu's avatar
Chao Liu committed
678
679
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
680
681
682
683
684
685
    }
    if(offset == 3840)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3840\n \
                "
Chao Liu's avatar
Chao Liu committed
686
687
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
688
689
690
691
692
693
    }
    if(offset == 3904)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3904\n \
                "
Chao Liu's avatar
Chao Liu committed
694
695
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
696
697
698
699
700
701
    }
    if(offset == 3968)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:3968\n \
                "
Chao Liu's avatar
Chao Liu committed
702
703
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
704
705
706
707
708
709
    }
    if(offset == 4032)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4032\n \
                "
Chao Liu's avatar
Chao Liu committed
710
711
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
712
713
714
715
716
717
    }
    if(offset == 4096)
    {
        asm volatile("\n \
                ds_read_b128 %0, %1 offset:4096\n \
                "
Chao Liu's avatar
Chao Liu committed
718
719
                     : "=v"(r)
                     : "v"(__to_local(lds)));
Jing Zhang's avatar
Jing Zhang committed
720
    }
Jing Zhang's avatar
Jing Zhang committed
721
#endif
Jing Zhang's avatar
Jing Zhang committed
722
}
Jing Zhang's avatar
Jing Zhang committed
723

Jing Zhang's avatar
add asm  
Jing Zhang committed
724
__device__ void global_loadx4(void* r, const void* ptr, index_t offset = 0)
725
{
Jing Zhang's avatar
Jing Zhang committed
726
#if !NO_GLB_READ
727
728
    if(offset == 0)
    {
Jing Zhang's avatar
add asm  
Jing Zhang committed
729
        //*(vector_type<float, 4>::MemoryType*)(r) = *(vector_type<float, 4>::MemoryType*)(ptr);
730
731
732
        asm volatile("\n \
                global_load_dwordx4 %0, %1, off \n \
                "
Jing Zhang's avatar
add asm  
Jing Zhang committed
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
                     : "=v"(*(vector_type<float, 4>::MemoryType*)(r))
                     : "r"(ptr));
    }
    else
    {
        assert(false);
    }
#endif
}

__device__ void global_loadx2(void* r, const void* ptr, index_t offset = 0)
{
#if !NO_GLB_READ
    if(offset == 0)
    {
        asm volatile("\n \
                global_load_dwordx2 %0, %1, off \n \
                "
                     : "=v"(*(vector_type<float, 2>::MemoryType*)(r))
                     : "r"(ptr));
    }
    else
    {
        assert(false);
    }
#endif
}

__device__ void global_loadx1(void* r, const void* ptr, index_t offset = 0)
{
#if !NO_GLB_READ
    if(offset == 0)
    {
        //*(float*)(r) = *(float*)(ptr);
        asm volatile("\n \
                global_load_dword %0, %1, off \n \
                "
                     : "=v"(*(float*)(r))
                     : "r"(ptr));
772
773
774
775
776
    }
    else
    {
        assert(false);
    }
Jing Zhang's avatar
Jing Zhang committed
777
#endif
Jing Zhang's avatar
Jing Zhang committed
778
779
}

Jing Zhang's avatar
add asm  
Jing Zhang committed
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
__device__ void global_storex4(const void* ptr, const void* r, index_t offset = 0)
{
#if !NO_GLB_READ
    if(offset == 0)
    {
        //*(vector_type<float, 4>::MemoryType*)(ptr) = *(vector_type<float, 4>::MemoryType*)(r);
        asm volatile("\n \
                global_store_dwordx4 %0, %1, off \n \
                "
                     :
                     : "r"(ptr), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
    }
    else
    {
        assert(false);
    }
#endif
}

__device__ void global_storex2(const void* ptr, const void* r, index_t offset = 0)
{
#if !NO_GLB_READ
    if(offset == 0)
    {
        asm volatile("\n \
                global_store_dwordx2 %0, %1, off \n \
                "
                     :
                     : "r"(ptr), "v"(*(vector_type<float, 2>::MemoryType*)(r)));
    }
    else
    {
        assert(false);
    }
#endif
}

__device__ void global_storex1(const void* ptr, const void* r, index_t offset = 0)
{
#if !NO_GLB_READ
    if(offset == 0)
    {
        //*(float*)(ptr) = *(float*)(r);
        asm volatile("\n \
                global_store_dword %0, %1, off \n \
                "
                     :
                     : "r"(ptr), "v"(*(float*)(r)));
    }
    else
    {
        assert(false);
    }
#endif
}

__device__ void ds_write_b128(const void* lds, const void* r, index_t offset = 0)
Jing Zhang's avatar
Jing Zhang committed
837
{
Jing Zhang's avatar
Jing Zhang committed
838
#if !NO_DS_WRITE
839
840
    if(offset == 0)
    {
Jing Zhang's avatar
add asm  
Jing Zhang committed
841
        //*(vector_type<float, 4>::MemoryType*)(lds) = *(vector_type<float, 4>::MemoryType*)(r);
842
        asm volatile("\n \
Jing Zhang's avatar
add asm  
Jing Zhang committed
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
                ds_write_b128 %0, %1 \n \
                "
                     :
                     : "v"(__to_local(lds)), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
    }
    else
    {
        assert(false);
    }
#endif
}

__device__ void ds_write_b32(const void* lds, const void* r, index_t offset = 0)
{
#if !NO_DS_WRITE
    if(offset == 0)
    {
        //*(float*)(lds) = *(float*)(r);
        asm volatile("\n \
                ds_write_b32 %0, %1 \n \
                "
864
                     :
Jing Zhang's avatar
add asm  
Jing Zhang committed
865
                     : "v"(__to_local(lds)), "v"(*(float*)(r)));
866
867
868
869
870
    }
    else
    {
        assert(false);
    }
Jing Zhang's avatar
Jing Zhang committed
871
#endif
Jing Zhang's avatar
Jing Zhang committed
872
}