amd_inline_asm.hpp 15.5 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
// SPDX-License-Identifier: MIT
Illia Silin's avatar
Illia Silin committed
2
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
Chao Liu's avatar
Chao Liu committed
3

4
5
6
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP

7
#include "data_type.hpp"
8
#include "c_style_pointer_cast.hpp"
Jing Zhang's avatar
Jing Zhang committed
9

10
11
// TODO: deprecate all amd_assembly_outer_product_xxx

12
13
namespace ck {

Jing Zhang's avatar
Jing Zhang committed
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c) {
  half2_t d;
  asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
		  : "=v"(d)
		  : "v"(a), "v"(b), "v"(c));
  return d;
}

inline __device__ half2_t amd_assembly_pk_add_f16(half2_t a, half2_t b) {
	half2_t c;
	asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
	return c;
}

Chao Liu's avatar
Chao Liu committed
29
30
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
31
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
32
{
Chao Liu's avatar
Chao Liu committed
33
34
35
36
37
38
    asm volatile("\n \
            v_fmac_f32 %0, %2, %3 \n \
            v_fmac_f32 %1, %2, %4 \n \
            "
                 : "=v"(c0), "=v"(c1)
                 : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
Chao Liu's avatar
Chao Liu committed
39
40
}

Chao Liu's avatar
Chao Liu committed
41
42
43
44
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
45
__device__ void amd_assembly_outer_product_1x4(
46
    float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
Chao Liu's avatar
Chao Liu committed
47
{
Chao Liu's avatar
Chao Liu committed
48
49
50
51
52
53
54
55
    asm volatile("\n \
            v_fmac_f32 %0, %4, %5 \n \
            v_fmac_f32 %1, %4, %6 \n \
            v_fmac_f32 %2, %4, %7 \n \
            v_fmac_f32 %3, %4, %8 \n \
            "
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
                 : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
Jing Zhang's avatar
Jing Zhang committed
56
57
}

Chao Liu's avatar
Chao Liu committed
58
59
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
60
61
__device__ void
amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
62
{
63
    asm volatile("\n \
64
65
            v_dot2_f32_f16 %0, %2, %3, %0\n \
            v_dot2_f32_f16 %1, %2, %4, %1\n \
66
            "
Chao Liu's avatar
Chao Liu committed
67
68
                 : "=v"(c0), "=v"(c1)
                 : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
Jing Zhang's avatar
Jing Zhang committed
69
70
}

Chao Liu's avatar
Chao Liu committed
71
72
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
73
74
__device__ void
amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
75
{
76
    // TODO remove pointer casting
77
78
79
    const half2_t* p_a_half2  = c_style_pointer_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = c_style_pointer_cast<const half2_t*>(&b0);
    const half2_t* p_b1_half2 = c_style_pointer_cast<const half2_t*>(&b1);
Chao Liu's avatar
Chao Liu committed
80

81
82
    // do dot2 two times
    asm volatile("\n \
83
84
85
86
            v_dot2_f32_f16 %0, %2, %4, %0\n \
            v_dot2_f32_f16 %1, %2, %6, %1\n \
            v_dot2_f32_f16 %0, %3, %5, %0\n \
            v_dot2_f32_f16 %1, %3, %7, %1\n \
87
            "
Chao Liu's avatar
Chao Liu committed
88
                 : "=v"(c0), "=v"(c1)
89
                 : "v"(p_a_half2[0]),
Chao Liu's avatar
Chao Liu committed
90
                   "v"(p_a_half2[1]),
91
92
93
                   "v"(p_b0_half2[0]),
                   "v"(p_b0_half2[1]),
                   "v"(p_b1_half2[0]),
Chao Liu's avatar
Chao Liu committed
94
                   "v"(p_b1_half2[1]),
95
                   "0"(c0),
Chao Liu's avatar
Chao Liu committed
96
                   "1"(c1));
Jing Zhang's avatar
Jing Zhang committed
97
98
}

Chao Liu's avatar
Chao Liu committed
99
100
101
102
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
103
104
105
106
107
108
109
110
111
__device__ void amd_assembly_outer_product_1x4(half2_t a,
                                               half2_t b0,
                                               half2_t b1,
                                               half2_t b2,
                                               half2_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
Jing Zhang's avatar
Jing Zhang committed
112
{
113
    asm volatile("\n \
114
115
116
117
            v_dot2_f32_f16 %0, %4, %5, %0\n \
            v_dot2_f32_f16 %1, %4, %6, %1\n \
            v_dot2_f32_f16 %2, %4, %7, %2\n \
            v_dot2_f32_f16 %3, %4, %8, %3\n \
118
            "
Chao Liu's avatar
Chao Liu committed
119
120
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
                 : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
Jing Zhang's avatar
Jing Zhang committed
121
122
}

Chao Liu's avatar
Chao Liu committed
123
124
125
126
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
127
128
129
130
131
132
133
134
135
__device__ void amd_assembly_outer_product_1x4(half4_t a,
                                               half4_t b0,
                                               half4_t b1,
                                               half4_t b2,
                                               half4_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
Jing Zhang's avatar
Jing Zhang committed
136
{
137
    // TODO remove pointer casting
138
139
140
141
142
    const half2_t* p_a_half2  = c_style_pointer_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = c_style_pointer_cast<const half2_t*>(&b0);
    const half2_t* p_b1_half2 = c_style_pointer_cast<const half2_t*>(&b1);
    const half2_t* p_b2_half2 = c_style_pointer_cast<const half2_t*>(&b2);
    const half2_t* p_b3_half2 = c_style_pointer_cast<const half2_t*>(&b3);
Jing Zhang's avatar
Jing Zhang committed
143

144
145
    // do dot2 two times
    asm volatile("\n \
146
147
148
149
150
151
152
153
            v_dot2_f32_f16 %0, %4, %6,  %0\n \
            v_dot2_f32_f16 %1, %4, %8,  %1\n \
            v_dot2_f32_f16 %2, %4, %10, %2\n \
            v_dot2_f32_f16 %3, %4, %12, %3\n \
            v_dot2_f32_f16 %0, %5, %7,  %0\n \
            v_dot2_f32_f16 %1, %5, %9,  %1\n \
            v_dot2_f32_f16 %2, %5, %11, %2\n \
            v_dot2_f32_f16 %3, %5, %13, %3\n \
Jing Zhang's avatar
Jing Zhang committed
154
            "
Chao Liu's avatar
Chao Liu committed
155
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
156
                 : "v"(p_a_half2[0]),
Chao Liu's avatar
Chao Liu committed
157
                   "v"(p_a_half2[1]),
158
159
160
                   "v"(p_b0_half2[0]),
                   "v"(p_b0_half2[1]),
                   "v"(p_b1_half2[0]),
Chao Liu's avatar
Chao Liu committed
161
                   "v"(p_b1_half2[1]),
162
163
164
                   "v"(p_b2_half2[0]),
                   "v"(p_b2_half2[1]),
                   "v"(p_b3_half2[0]),
Chao Liu's avatar
Chao Liu committed
165
                   "v"(p_b3_half2[1]),
166
167
168
                   "0"(c0),
                   "1"(c1),
                   "2"(c2),
Chao Liu's avatar
Chao Liu committed
169
170
171
                   "3"(c3));
}

172
173
174
175
176
177
178
179
180
181
182
__device__ void amd_assembly_outer_product_1x4(half8_t a,
                                               half8_t b0,
                                               half8_t b1,
                                               half8_t b2,
                                               half8_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
{

183
    // TODO remove pointer casting
184
185
186
187
188
    const half4_t* p_a_half4  = c_style_pointer_cast<const half4_t*>(&a);
    const half4_t* p_b0_half4 = c_style_pointer_cast<const half4_t*>(&b0);
    const half4_t* p_b1_half4 = c_style_pointer_cast<const half4_t*>(&b1);
    const half4_t* p_b2_half4 = c_style_pointer_cast<const half4_t*>(&b2);
    const half4_t* p_b3_half4 = c_style_pointer_cast<const half4_t*>(&b3);
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

    amd_assembly_outer_product_1x4(
        p_a_half4[0], p_b0_half4[0], p_b1_half4[0], p_b2_half4[0], p_b3_half4[0], c0, c1, c2, c3);

    amd_assembly_outer_product_1x4(
        p_a_half4[1], p_b0_half4[1], p_b1_half4[1], p_b2_half4[1], p_b3_half4[1], c0, c1, c2, c3);
}

__device__ void amd_assembly_outer_product_1x4(half16_t a,
                                               half16_t b0,
                                               half16_t b1,
                                               half16_t b2,
                                               half16_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
{
207
    // TODO remove pointer casting
208
209
210
211
212
    const half8_t* p_a_half8  = c_style_pointer_cast<const half8_t*>(&a);
    const half8_t* p_b0_half8 = c_style_pointer_cast<const half8_t*>(&b0);
    const half8_t* p_b1_half8 = c_style_pointer_cast<const half8_t*>(&b1);
    const half8_t* p_b2_half8 = c_style_pointer_cast<const half8_t*>(&b2);
    const half8_t* p_b3_half8 = c_style_pointer_cast<const half8_t*>(&b3);
213
214
215
216
217
218
219
220

    amd_assembly_outer_product_1x4(
        p_a_half8[0], p_b0_half8[0], p_b1_half8[0], p_b2_half8[0], p_b3_half8[0], c0, c1, c2, c3);

    amd_assembly_outer_product_1x4(
        p_a_half8[1], p_b0_half8[1], p_b1_half8[1], p_b2_half8[1], p_b3_half8[1], c0, c1, c2, c3);
}

Chao Liu's avatar
Chao Liu committed
221
222
223
224
225
226
227
228
229
230
231
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
__device__ void
amd_assembly_outer_product_1x2(int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t& c0, int32_t& c1)
{
#if 1
    asm volatile("\n \
            v_dot4_i32_i8 %0, %2, %3, %0\n \
            v_dot4_i32_i8 %1, %2, %4, %1\n \
            "
                 : "=v"(c0), "=v"(c1)
232
233
234
                 : "v"(bit_cast<int32_t>(a)),
                   "v"(bit_cast<int32_t>(b0)),
                   "v"(bit_cast<int32_t>(b1)),
235
236
                   "0"(c0),
                   "1"(c1));
Chao Liu's avatar
Chao Liu committed
237
#else
zjing14's avatar
zjing14 committed
238
239
    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
Chao Liu's avatar
Chao Liu committed
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#endif
}

// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
__device__ void amd_assembly_outer_product_1x4(int8x4_t a,
                                               int8x4_t b0,
                                               int8x4_t b1,
                                               int8x4_t b2,
                                               int8x4_t b3,
                                               int32_t& c0,
                                               int32_t& c1,
                                               int32_t& c2,
                                               int32_t& c3)
{
#if 1
    asm volatile("\n \
            v_dot4_i32_i8 %0, %4, %5, %0\n \
            v_dot4_i32_i8 %1, %4, %6, %1\n \
            v_dot4_i32_i8 %2, %4, %7, %2\n \
            v_dot4_i32_i8 %3, %4, %8, %3\n \
            "
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
265
266
267
268
269
                 : "v"(bit_cast<int32_t>(a)),
                   "v"(bit_cast<int32_t>(b0)),
                   "v"(bit_cast<int32_t>(b1)),
                   "v"(bit_cast<int32_t>(b2)),
                   "v"(bit_cast<int32_t>(b3)),
270
271
272
273
                   "0"(c0),
                   "1"(c1),
                   "2"(c2),
                   "3"(c3));
Chao Liu's avatar
Chao Liu committed
274
#else
zjing14's avatar
zjing14 committed
275
276
277
278
    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
    c2 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b2), c2, false);
    c3 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b3), c3, false);
Chao Liu's avatar
Chao Liu committed
279
#endif
Jing Zhang's avatar
Jing Zhang committed
280
}
281

282
283
284
285
286
287
288
289
290
291
__device__ void amd_assembly_outer_product_1x4(int8x8_t a,
                                               int8x8_t b0,
                                               int8x8_t b1,
                                               int8x8_t b2,
                                               int8x8_t b3,
                                               int32_t& c0,
                                               int32_t& c1,
                                               int32_t& c2,
                                               int32_t& c3)
{
292
293
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
294

295
296
297
298
299
    amd_assembly_outer_product_1x4(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b0}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b1}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b2}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 8>{b3}.AsType<int8x4_t>()[I0],
300
301
302
303
304
                                   c0,
                                   c1,
                                   c2,
                                   c3);

305
306
307
308
309
    amd_assembly_outer_product_1x4(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b0}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b1}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b2}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 8>{b3}.AsType<int8x4_t>()[I1],
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
                                   c0,
                                   c1,
                                   c2,
                                   c3);
}

__device__ void amd_assembly_outer_product_1x4(int8x16_t a,
                                               int8x16_t b0,
                                               int8x16_t b1,
                                               int8x16_t b2,
                                               int8x16_t b3,
                                               int32_t& c0,
                                               int32_t& c1,
                                               int32_t& c2,
                                               int32_t& c3)

{
327
328
329
330
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
331

332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I0],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I0],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I1],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I1],
                                   c0,
                                   c1,
                                   c2,
                                   c3);
351

352
353
354
355
356
    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I2],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I2],
357
358
359
360
361
                                   c0,
                                   c1,
                                   c2,
                                   c3);

362
363
364
365
366
    amd_assembly_outer_product_1x4(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b0}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b1}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b2}.AsType<int8x4_t>()[I3],
                                   vector_type<int8_t, 16>{b3}.AsType<int8x4_t>()[I3],
367
368
369
370
371
372
                                   c0,
                                   c1,
                                   c2,
                                   c3);
}

373
374
} // namespace ck
#endif