amd_inline_asm.hpp 17.9 KB
Newer Older
1
2
3
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP

4
#include "float_type.hpp"
Jing Zhang's avatar
Jing Zhang committed
5

6
7
namespace ck {

8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// c += inner_product(a, b)
__device__ void amd_assembly_inner_product(const float& a, const float& b, float& c)
{
#if CK_USE_AMD_V_FMAC_F32
    asm volatile("\n \
            v_fmac_f32 %0, %1, %2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
#else
    asm volatile("\n \
            v_mac_f32 %0, %1, %2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
#endif
}

__device__ void amd_assembly_inner_product(const int8x4_t& a, const int8x4_t& b, int32_t& c)
{
#if 1
    asm volatile("\n \
            v_dot4_i32_i8 %0, %1, %2, %0\n \
            "
                 : "=v"(c)
                 : "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
#else
    c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
#endif
}

__device__ void amd_assembly_inner_product(const int8x8_t& a, const int8x8_t& b, int32_t& c)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
                               c);

    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
                               c);
}

__device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t& b, int32_t& c)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
                               c);

    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
                               c);

    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
                               c);

    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
                               c);
}

Chao Liu's avatar
Chao Liu committed
77
78
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
79
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
80
{
Chao Liu's avatar
Chao Liu committed
81
82
83
84
85
86
87
88
#if CK_USE_AMD_V_FMAC_F32
    asm volatile("\n \
            v_fmac_f32 %0, %2, %3 \n \
            v_fmac_f32 %1, %2, %4 \n \
            "
                 : "=v"(c0), "=v"(c1)
                 : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
#else
89
90
91
92
93
94
    asm volatile("\n \
            v_mac_f32 %0, %2, %3 \n \
            v_mac_f32 %1, %2, %4 \n \
            "
                 : "=v"(c0), "=v"(c1)
                 : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
Chao Liu's avatar
Chao Liu committed
95
#endif
Chao Liu's avatar
Chao Liu committed
96
97
}

Chao Liu's avatar
Chao Liu committed
98
99
100
101
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
102
__device__ void amd_assembly_outer_product_1x4(
103
    float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
Chao Liu's avatar
Chao Liu committed
104
{
Chao Liu's avatar
Chao Liu committed
105
106
107
108
109
110
111
112
113
114
#if CK_USE_AMD_V_FMAC_F32
    asm volatile("\n \
            v_fmac_f32 %0, %4, %5 \n \
            v_fmac_f32 %1, %4, %6 \n \
            v_fmac_f32 %2, %4, %7 \n \
            v_fmac_f32 %3, %4, %8 \n \
            "
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
                 : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
#else
Jing Zhang's avatar
Jing Zhang committed
115
116
117
118
119
120
    asm volatile("\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
            v_mac_f32 %2, %4, %7 \n \
            v_mac_f32 %3, %4, %8 \n \
            "
121
122
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
                 : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
Chao Liu's avatar
Chao Liu committed
123
#endif
Jing Zhang's avatar
Jing Zhang committed
124
125
}

Chao Liu's avatar
Chao Liu committed
126
127
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
128
129
__device__ void
amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
130
{
131
    asm volatile("\n \
132
133
            v_dot2_f32_f16 %0, %2, %3, %0\n \
            v_dot2_f32_f16 %1, %2, %4, %1\n \
134
            "
Chao Liu's avatar
Chao Liu committed
135
136
                 : "=v"(c0), "=v"(c1)
                 : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
Jing Zhang's avatar
Jing Zhang committed
137
138
}

Chao Liu's avatar
Chao Liu committed
139
140
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
141
142
__device__ void
amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
143
{
144
    // TODO remove pointer casting
145
146
147
    const half2_t* p_a_half2  = reinterpret_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = reinterpret_cast<const half2_t*>(&b0);
    const half2_t* p_b1_half2 = reinterpret_cast<const half2_t*>(&b1);
Chao Liu's avatar
Chao Liu committed
148

149
150
    // do dot2 two times
    asm volatile("\n \
151
152
153
154
            v_dot2_f32_f16 %0, %2, %4, %0\n \
            v_dot2_f32_f16 %1, %2, %6, %1\n \
            v_dot2_f32_f16 %0, %3, %5, %0\n \
            v_dot2_f32_f16 %1, %3, %7, %1\n \
155
            "
Chao Liu's avatar
Chao Liu committed
156
                 : "=v"(c0), "=v"(c1)
157
                 : "v"(p_a_half2[0]),
Chao Liu's avatar
Chao Liu committed
158
                   "v"(p_a_half2[1]),
159
160
161
                   "v"(p_b0_half2[0]),
                   "v"(p_b0_half2[1]),
                   "v"(p_b1_half2[0]),
Chao Liu's avatar
Chao Liu committed
162
                   "v"(p_b1_half2[1]),
163
                   "0"(c0),
Chao Liu's avatar
Chao Liu committed
164
                   "1"(c1));
Jing Zhang's avatar
Jing Zhang committed
165
166
}

Chao Liu's avatar
Chao Liu committed
167
168
169
170
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
171
172
173
174
175
176
177
178
179
__device__ void amd_assembly_outer_product_1x4(half2_t a,
                                               half2_t b0,
                                               half2_t b1,
                                               half2_t b2,
                                               half2_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
Jing Zhang's avatar
Jing Zhang committed
180
{
181
    asm volatile("\n \
182
183
184
185
            v_dot2_f32_f16 %0, %4, %5, %0\n \
            v_dot2_f32_f16 %1, %4, %6, %1\n \
            v_dot2_f32_f16 %2, %4, %7, %2\n \
            v_dot2_f32_f16 %3, %4, %8, %3\n \
186
            "
Chao Liu's avatar
Chao Liu committed
187
188
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
                 : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
Jing Zhang's avatar
Jing Zhang committed
189
190
}

Chao Liu's avatar
Chao Liu committed
191
192
193
194
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
195
196
197
198
199
200
201
202
203
__device__ void amd_assembly_outer_product_1x4(half4_t a,
                                               half4_t b0,
                                               half4_t b1,
                                               half4_t b2,
                                               half4_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
Jing Zhang's avatar
Jing Zhang committed
204
{
205
    // TODO remove pointer casting
206
207
208
209
210
    const half2_t* p_a_half2  = reinterpret_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = reinterpret_cast<const half2_t*>(&b0);
    const half2_t* p_b1_half2 = reinterpret_cast<const half2_t*>(&b1);
    const half2_t* p_b2_half2 = reinterpret_cast<const half2_t*>(&b2);
    const half2_t* p_b3_half2 = reinterpret_cast<const half2_t*>(&b3);
Jing Zhang's avatar
Jing Zhang committed
211

212
213
    // do dot2 two times
    asm volatile("\n \
214
215
216
217
218
219
220
221
            v_dot2_f32_f16 %0, %4, %6,  %0\n \
            v_dot2_f32_f16 %1, %4, %8,  %1\n \
            v_dot2_f32_f16 %2, %4, %10, %2\n \
            v_dot2_f32_f16 %3, %4, %12, %3\n \
            v_dot2_f32_f16 %0, %5, %7,  %0\n \
            v_dot2_f32_f16 %1, %5, %9,  %1\n \
            v_dot2_f32_f16 %2, %5, %11, %2\n \
            v_dot2_f32_f16 %3, %5, %13, %3\n \
Jing Zhang's avatar
Jing Zhang committed
222
            "
Chao Liu's avatar
Chao Liu committed
223
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
224
                 : "v"(p_a_half2[0]),
Chao Liu's avatar
Chao Liu committed
225
                   "v"(p_a_half2[1]),
226
227
228
                   "v"(p_b0_half2[0]),
                   "v"(p_b0_half2[1]),
                   "v"(p_b1_half2[0]),
Chao Liu's avatar
Chao Liu committed
229
                   "v"(p_b1_half2[1]),
230
231
232
                   "v"(p_b2_half2[0]),
                   "v"(p_b2_half2[1]),
                   "v"(p_b3_half2[0]),
Chao Liu's avatar
Chao Liu committed
233
                   "v"(p_b3_half2[1]),
234
235
236
                   "0"(c0),
                   "1"(c1),
                   "2"(c2),
Chao Liu's avatar
Chao Liu committed
237
238
239
                   "3"(c3));
}

240
241
242
243
244
245
246
247
248
249
250
__device__ void amd_assembly_outer_product_1x4(half8_t a,
                                               half8_t b0,
                                               half8_t b1,
                                               half8_t b2,
                                               half8_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
{

251
    // TODO remove pointer casting
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
    const half4_t* p_a_half4  = reinterpret_cast<const half4_t*>(&a);
    const half4_t* p_b0_half4 = reinterpret_cast<const half4_t*>(&b0);
    const half4_t* p_b1_half4 = reinterpret_cast<const half4_t*>(&b1);
    const half4_t* p_b2_half4 = reinterpret_cast<const half4_t*>(&b2);
    const half4_t* p_b3_half4 = reinterpret_cast<const half4_t*>(&b3);

    amd_assembly_outer_product_1x4(
        p_a_half4[0], p_b0_half4[0], p_b1_half4[0], p_b2_half4[0], p_b3_half4[0], c0, c1, c2, c3);

    amd_assembly_outer_product_1x4(
        p_a_half4[1], p_b0_half4[1], p_b1_half4[1], p_b2_half4[1], p_b3_half4[1], c0, c1, c2, c3);
}

__device__ void amd_assembly_outer_product_1x4(half16_t a,
                                               half16_t b0,
                                               half16_t b1,
                                               half16_t b2,
                                               half16_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
{
275
    // TODO remove pointer casting
276
277
278
279
280
281
282
283
284
285
286
287
288
    const half8_t* p_a_half8  = reinterpret_cast<const half8_t*>(&a);
    const half8_t* p_b0_half8 = reinterpret_cast<const half8_t*>(&b0);
    const half8_t* p_b1_half8 = reinterpret_cast<const half8_t*>(&b1);
    const half8_t* p_b2_half8 = reinterpret_cast<const half8_t*>(&b2);
    const half8_t* p_b3_half8 = reinterpret_cast<const half8_t*>(&b3);

    amd_assembly_outer_product_1x4(
        p_a_half8[0], p_b0_half8[0], p_b1_half8[0], p_b2_half8[0], p_b3_half8[0], c0, c1, c2, c3);

    amd_assembly_outer_product_1x4(
        p_a_half8[1], p_b0_half8[1], p_b1_half8[1], p_b2_half8[1], p_b3_half8[1], c0, c1, c2, c3);
}

Chao Liu's avatar
Chao Liu committed
289
290
291
292
293
294
295
296
297
298
299
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
__device__ void
amd_assembly_outer_product_1x2(int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t& c0, int32_t& c1)
{
#if 1
    asm volatile("\n \
            v_dot4_i32_i8 %0, %2, %3, %0\n \
            v_dot4_i32_i8 %1, %2, %4, %1\n \
            "
                 : "=v"(c0), "=v"(c1)
300
301
302
303
304
                 : "v"(as_type<int32_t>(a)),
                   "v"(as_type<int32_t>(b0)),
                   "v"(as_type<int32_t>(b1)),
                   "0"(c0),
                   "1"(c1));
Chao Liu's avatar
Chao Liu committed
305
#else
306
307
    c0 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b0), c0, false);
    c1 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b1), c1, false);
Chao Liu's avatar
Chao Liu committed
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#endif
}

// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
__device__ void amd_assembly_outer_product_1x4(int8x4_t a,
                                               int8x4_t b0,
                                               int8x4_t b1,
                                               int8x4_t b2,
                                               int8x4_t b3,
                                               int32_t& c0,
                                               int32_t& c1,
                                               int32_t& c2,
                                               int32_t& c3)
{
#if 1
    asm volatile("\n \
            v_dot4_i32_i8 %0, %4, %5, %0\n \
            v_dot4_i32_i8 %1, %4, %6, %1\n \
            v_dot4_i32_i8 %2, %4, %7, %2\n \
            v_dot4_i32_i8 %3, %4, %8, %3\n \
            "
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
333
334
335
336
337
338
339
340
341
                 : "v"(as_type<int32_t>(a)),
                   "v"(as_type<int32_t>(b0)),
                   "v"(as_type<int32_t>(b1)),
                   "v"(as_type<int32_t>(b2)),
                   "v"(as_type<int32_t>(b3)),
                   "0"(c0),
                   "1"(c1),
                   "2"(c2),
                   "3"(c3));
Chao Liu's avatar
Chao Liu committed
342
#else
343
344
345
346
    c0 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b0), c0, false);
    c1 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b1), c1, false);
    c2 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b2), c2, false);
    c3 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b3), c3, false);
Chao Liu's avatar
Chao Liu committed
347
#endif
Jing Zhang's avatar
Jing Zhang committed
348
}
349

350
351
352
353
354
355
356
357
358
359
__device__ void amd_assembly_outer_product_1x4(int8x8_t a,
                                               int8x8_t b0,
                                               int8x8_t b1,
                                               int8x8_t b2,
                                               int8x8_t b3,
                                               int32_t& c0,
                                               int32_t& c1,
                                               int32_t& c2,
                                               int32_t& c3)
{
360
361
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
362

363
364
365
366
367
    amd_assembly_outer_product_1x4(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b0}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b1}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b2}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b3}.AsType<int8x4_t>()[I0],
368
369
370
371
372
                                   c0,
                                   c1,
                                   c2,
                                   c3);

373
374
375
376
377
    amd_assembly_outer_product_1x4(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b0}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b1}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b2}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b3}.AsType<int8x4_t>()[I1],
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
                                   c0,
                                   c1,
                                   c2,
                                   c3);
}

__device__ void amd_assembly_outer_product_1x4(int8x16_t a,
                                               int8x16_t b0,
                                               int8x16_t b1,
                                               int8x16_t b2,
                                               int8x16_t b3,
                                               int32_t& c0,
                                               int32_t& c1,
                                               int32_t& c2,
                                               int32_t& c3)

{
395
396
397
398
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
399

400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I0],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I1],
                                   c0,
                                   c1,
                                   c2,
                                   c3);
419

420
421
422
423
424
    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I2],
425
426
427
428
429
                                   c0,
                                   c1,
                                   c2,
                                   c3);

430
431
432
433
434
    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I3],
435
436
437
438
439
440
                                   c0,
                                   c1,
                                   c2,
                                   c3);
}

441
442
} // namespace ck
#endif