amd_inline_asm.hpp 15.1 KB
Newer Older
1
2
3
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP

4
#include "data_type.hpp"
5
#include "c_style_pointer_cast.hpp"
Jing Zhang's avatar
Jing Zhang committed
6

7
8
#ifndef CK_NOGPU

9
10
// TODO: deprecate all amd_assembly_outer_product_xxx

11
12
namespace ck {

Chao Liu's avatar
Chao Liu committed
13
14
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
15
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
16
{
Chao Liu's avatar
Chao Liu committed
17
18
19
20
21
22
    asm volatile("\n \
            v_fmac_f32 %0, %2, %3 \n \
            v_fmac_f32 %1, %2, %4 \n \
            "
                 : "=v"(c0), "=v"(c1)
                 : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
Chao Liu's avatar
Chao Liu committed
23
24
}

Chao Liu's avatar
Chao Liu committed
25
26
27
28
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
29
__device__ void amd_assembly_outer_product_1x4(
30
    float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
Chao Liu's avatar
Chao Liu committed
31
{
Chao Liu's avatar
Chao Liu committed
32
33
34
35
36
37
38
39
    asm volatile("\n \
            v_fmac_f32 %0, %4, %5 \n \
            v_fmac_f32 %1, %4, %6 \n \
            v_fmac_f32 %2, %4, %7 \n \
            v_fmac_f32 %3, %4, %8 \n \
            "
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
                 : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
Jing Zhang's avatar
Jing Zhang committed
40
41
}

Chao Liu's avatar
Chao Liu committed
42
43
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
44
45
__device__ void
amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
46
{
47
    asm volatile("\n \
48
49
            v_dot2_f32_f16 %0, %2, %3, %0\n \
            v_dot2_f32_f16 %1, %2, %4, %1\n \
50
            "
Chao Liu's avatar
Chao Liu committed
51
52
                 : "=v"(c0), "=v"(c1)
                 : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
Jing Zhang's avatar
Jing Zhang committed
53
54
}

Chao Liu's avatar
Chao Liu committed
55
56
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
57
58
__device__ void
amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
Chao Liu's avatar
Chao Liu committed
59
{
60
    // TODO remove pointer casting
61
62
63
    const half2_t* p_a_half2  = c_style_pointer_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = c_style_pointer_cast<const half2_t*>(&b0);
    const half2_t* p_b1_half2 = c_style_pointer_cast<const half2_t*>(&b1);
Chao Liu's avatar
Chao Liu committed
64

65
66
    // do dot2 two times
    asm volatile("\n \
67
68
69
70
            v_dot2_f32_f16 %0, %2, %4, %0\n \
            v_dot2_f32_f16 %1, %2, %6, %1\n \
            v_dot2_f32_f16 %0, %3, %5, %0\n \
            v_dot2_f32_f16 %1, %3, %7, %1\n \
71
            "
Chao Liu's avatar
Chao Liu committed
72
                 : "=v"(c0), "=v"(c1)
73
                 : "v"(p_a_half2[0]),
Chao Liu's avatar
Chao Liu committed
74
                   "v"(p_a_half2[1]),
75
76
77
                   "v"(p_b0_half2[0]),
                   "v"(p_b0_half2[1]),
                   "v"(p_b1_half2[0]),
Chao Liu's avatar
Chao Liu committed
78
                   "v"(p_b1_half2[1]),
79
                   "0"(c0),
Chao Liu's avatar
Chao Liu committed
80
                   "1"(c1));
Jing Zhang's avatar
Jing Zhang committed
81
82
}

Chao Liu's avatar
Chao Liu committed
83
84
85
86
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
87
88
89
90
91
92
93
94
95
__device__ void amd_assembly_outer_product_1x4(half2_t a,
                                               half2_t b0,
                                               half2_t b1,
                                               half2_t b2,
                                               half2_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
Jing Zhang's avatar
Jing Zhang committed
96
{
97
    asm volatile("\n \
98
99
100
101
            v_dot2_f32_f16 %0, %4, %5, %0\n \
            v_dot2_f32_f16 %1, %4, %6, %1\n \
            v_dot2_f32_f16 %2, %4, %7, %2\n \
            v_dot2_f32_f16 %3, %4, %8, %3\n \
102
            "
Chao Liu's avatar
Chao Liu committed
103
104
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
                 : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
Jing Zhang's avatar
Jing Zhang committed
105
106
}

Chao Liu's avatar
Chao Liu committed
107
108
109
110
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
111
112
113
114
115
116
117
118
119
__device__ void amd_assembly_outer_product_1x4(half4_t a,
                                               half4_t b0,
                                               half4_t b1,
                                               half4_t b2,
                                               half4_t b3,
                                               float& c0,
                                               float& c1,
                                               float& c2,
                                               float& c3)
Jing Zhang's avatar
Jing Zhang committed
120
{
121
    // TODO remove pointer casting
122
123
124
125
126
    const half2_t* p_a_half2  = c_style_pointer_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = c_style_pointer_cast<const half2_t*>(&b0);
    const half2_t* p_b1_half2 = c_style_pointer_cast<const half2_t*>(&b1);
    const half2_t* p_b2_half2 = c_style_pointer_cast<const half2_t*>(&b2);
    const half2_t* p_b3_half2 = c_style_pointer_cast<const half2_t*>(&b3);
Jing Zhang's avatar
Jing Zhang committed
127

128
129
    // do dot2 two times
    asm volatile("\n \
130
131
132
133
134
135
136
137
            v_dot2_f32_f16 %0, %4, %6,  %0\n \
            v_dot2_f32_f16 %1, %4, %8,  %1\n \
            v_dot2_f32_f16 %2, %4, %10, %2\n \
            v_dot2_f32_f16 %3, %4, %12, %3\n \
            v_dot2_f32_f16 %0, %5, %7,  %0\n \
            v_dot2_f32_f16 %1, %5, %9,  %1\n \
            v_dot2_f32_f16 %2, %5, %11, %2\n \
            v_dot2_f32_f16 %3, %5, %13, %3\n \
Jing Zhang's avatar
Jing Zhang committed
138
            "
Chao Liu's avatar
Chao Liu committed
139
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
140
                 : "v"(p_a_half2[0]),
Chao Liu's avatar
Chao Liu committed
141
                   "v"(p_a_half2[1]),
142
143
144
                   "v"(p_b0_half2[0]),
                   "v"(p_b0_half2[1]),
                   "v"(p_b1_half2[0]),
Chao Liu's avatar
Chao Liu committed
145
                   "v"(p_b1_half2[1]),
146
147
148
                   "v"(p_b2_half2[0]),
                   "v"(p_b2_half2[1]),
                   "v"(p_b3_half2[0]),
Chao Liu's avatar
Chao Liu committed
149
                   "v"(p_b3_half2[1]),
150
151
152
                   "0"(c0),
                   "1"(c1),
                   "2"(c2),
Chao Liu's avatar
Chao Liu committed
153
154
155
                   "3"(c3));
}

156
157
158
159
160
161
162
163
164
165
166
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
//
// half8_t variant: every operand is viewed as two consecutive half4_t values,
// and the half4_t overload is invoked once per half4_t position, accumulating
// into the same c0..c3 both times.
//
// Declared inline because the definition lives in a header that may be
// included from several translation units.
__device__ inline void amd_assembly_outer_product_1x4(half8_t a,
                                                      half8_t b0,
                                                      half8_t b1,
                                                      half8_t b2,
                                                      half8_t b3,
                                                      float& c0,
                                                      float& c1,
                                                      float& c2,
                                                      float& c3)
{
    // TODO remove pointer casting
    const half4_t* a_quads  = c_style_pointer_cast<const half4_t*>(&a);
    const half4_t* b0_quads = c_style_pointer_cast<const half4_t*>(&b0);
    const half4_t* b1_quads = c_style_pointer_cast<const half4_t*>(&b1);
    const half4_t* b2_quads = c_style_pointer_cast<const half4_t*>(&b2);
    const half4_t* b3_quads = c_style_pointer_cast<const half4_t*>(&b3);

    // first half4_t of every operand
    amd_assembly_outer_product_1x4(
        a_quads[0], b0_quads[0], b1_quads[0], b2_quads[0], b3_quads[0], c0, c1, c2, c3);

    // second half4_t of every operand
    amd_assembly_outer_product_1x4(
        a_quads[1], b0_quads[1], b1_quads[1], b2_quads[1], b3_quads[1], c0, c1, c2, c3);
}

// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
//
// half16_t variant: every operand is viewed as two consecutive half8_t values,
// and the half8_t overload is invoked once per half8_t position, accumulating
// into the same c0..c3 both times.
//
// Declared inline because the definition lives in a header that may be
// included from several translation units.
__device__ inline void amd_assembly_outer_product_1x4(half16_t a,
                                                      half16_t b0,
                                                      half16_t b1,
                                                      half16_t b2,
                                                      half16_t b3,
                                                      float& c0,
                                                      float& c1,
                                                      float& c2,
                                                      float& c3)
{
    // TODO remove pointer casting
    const half8_t* a_octets  = c_style_pointer_cast<const half8_t*>(&a);
    const half8_t* b0_octets = c_style_pointer_cast<const half8_t*>(&b0);
    const half8_t* b1_octets = c_style_pointer_cast<const half8_t*>(&b1);
    const half8_t* b2_octets = c_style_pointer_cast<const half8_t*>(&b2);
    const half8_t* b3_octets = c_style_pointer_cast<const half8_t*>(&b3);

    // first half8_t of every operand
    amd_assembly_outer_product_1x4(
        a_octets[0], b0_octets[0], b1_octets[0], b2_octets[0], b3_octets[0], c0, c1, c2, c3);

    // second half8_t of every operand
    amd_assembly_outer_product_1x4(
        a_octets[1], b0_octets[1], b1_octets[1], b2_octets[1], b3_octets[1], c0, c1, c2, c3);
}

Chao Liu's avatar
Chao Liu committed
205
206
207
208
209
210
211
212
213
214
215
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
//
// int8x4_t variant: each operand is bit_cast to a 32-bit integer and each
// accumulator is updated by one v_dot4_i32_i8, which takes the accumulator
// itself as its addend operand.
//   a      : left-hand operand shared by both products
//   b0, b1 : right-hand operands, one per accumulator
//   c0, c1 : int32_t accumulators, read and updated in place
//
// Declared inline because the definition lives in a header that may be
// included from several translation units.
__device__ inline void
amd_assembly_outer_product_1x2(int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t& c0, int32_t& c1)
{
#if 1
    // "+v" makes c0/c1 read-write VGPR operands (%0, %1); a, b0, b1 are %2..%4.
    asm volatile("v_dot4_i32_i8 %0, %2, %3, %0\n"
                 "v_dot4_i32_i8 %1, %2, %4, %1\n"
                 : "+v"(c0), "+v"(c1)
                 : "v"(bit_cast<int32_t>(a)),
                   "v"(bit_cast<int32_t>(b0)),
                   "v"(bit_cast<int32_t>(b1)));
#else
    // Disabled alternative using the compiler builtin instead of inline assembly.
    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
#endif
}

// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
//
// int8x4_t variant: each operand is bit_cast to a 32-bit integer and each
// accumulator is updated by one v_dot4_i32_i8, which takes the accumulator
// itself as its addend operand.
//   a      : left-hand operand shared by all four products
//   b0..b3 : right-hand operands, one per accumulator
//   c0..c3 : int32_t accumulators, read and updated in place
//
// Declared inline because the definition lives in a header that may be
// included from several translation units.
__device__ inline void amd_assembly_outer_product_1x4(int8x4_t a,
                                                      int8x4_t b0,
                                                      int8x4_t b1,
                                                      int8x4_t b2,
                                                      int8x4_t b3,
                                                      int32_t& c0,
                                                      int32_t& c1,
                                                      int32_t& c2,
                                                      int32_t& c3)
{
#if 1
    // "+v" makes c0..c3 read-write VGPR operands (%0..%3); a is %4, b0..b3 are %5..%8.
    asm volatile("v_dot4_i32_i8 %0, %4, %5, %0\n"
                 "v_dot4_i32_i8 %1, %4, %6, %1\n"
                 "v_dot4_i32_i8 %2, %4, %7, %2\n"
                 "v_dot4_i32_i8 %3, %4, %8, %3\n"
                 : "+v"(c0), "+v"(c1), "+v"(c2), "+v"(c3)
                 : "v"(bit_cast<int32_t>(a)),
                   "v"(bit_cast<int32_t>(b0)),
                   "v"(bit_cast<int32_t>(b1)),
                   "v"(bit_cast<int32_t>(b2)),
                   "v"(bit_cast<int32_t>(b3)));
#else
    // Disabled alternative using the compiler builtin instead of inline assembly.
    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
    c2 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b2), c2, false);
    c3 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b3), c3, false);
#endif
}
265

266
267
268
269
270
271
272
273
274
275
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
//
// int8x8_t variant: every operand is wrapped in a vector_type<int8_t, 8> and
// read as int8x4_t pieces; the int8x4_t overload is invoked for piece 0 and
// then for piece 1, accumulating into the same c0..c3 both times.
//
// Declared inline because the definition lives in a header that may be
// included from several translation units.
__device__ inline void amd_assembly_outer_product_1x4(int8x8_t a,
                                                      int8x8_t b0,
                                                      int8x8_t b1,
                                                      int8x8_t b2,
                                                      int8x8_t b3,
                                                      int32_t& c0,
                                                      int32_t& c1,
                                                      int32_t& c2,
                                                      int32_t& c3)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    // Wrap each operand once so its int8x4_t pieces can be indexed below.
    vector_type<int8_t, 8> a_vec{a};
    vector_type<int8_t, 8> b0_vec{b0};
    vector_type<int8_t, 8> b1_vec{b1};
    vector_type<int8_t, 8> b2_vec{b2};
    vector_type<int8_t, 8> b3_vec{b3};

    amd_assembly_outer_product_1x4(a_vec.AsType<int8x4_t>()[I0],
                                   b0_vec.AsType<int8x4_t>()[I0],
                                   b1_vec.AsType<int8x4_t>()[I0],
                                   b2_vec.AsType<int8x4_t>()[I0],
                                   b3_vec.AsType<int8x4_t>()[I0],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

    amd_assembly_outer_product_1x4(a_vec.AsType<int8x4_t>()[I1],
                                   b0_vec.AsType<int8x4_t>()[I1],
                                   b1_vec.AsType<int8x4_t>()[I1],
                                   b2_vec.AsType<int8x4_t>()[I1],
                                   b3_vec.AsType<int8x4_t>()[I1],
                                   c0,
                                   c1,
                                   c2,
                                   c3);
}

// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c2 += inner_product(a, b2)
// c3 += inner_product(a, b3)
//
// int8x16_t variant: every operand is wrapped in a vector_type<int8_t, 16> and
// read as int8x4_t pieces; the int8x4_t overload is invoked for pieces 0, 1, 2
// and 3 in that order, accumulating into the same c0..c3 each time.
//
// Declared inline because the definition lives in a header that may be
// included from several translation units.
__device__ inline void amd_assembly_outer_product_1x4(int8x16_t a,
                                                      int8x16_t b0,
                                                      int8x16_t b1,
                                                      int8x16_t b2,
                                                      int8x16_t b3,
                                                      int32_t& c0,
                                                      int32_t& c1,
                                                      int32_t& c2,
                                                      int32_t& c3)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // Wrap each operand once so its int8x4_t pieces can be indexed below.
    vector_type<int8_t, 16> a_vec{a};
    vector_type<int8_t, 16> b0_vec{b0};
    vector_type<int8_t, 16> b1_vec{b1};
    vector_type<int8_t, 16> b2_vec{b2};
    vector_type<int8_t, 16> b3_vec{b3};

    amd_assembly_outer_product_1x4(a_vec.AsType<int8x4_t>()[I0],
                                   b0_vec.AsType<int8x4_t>()[I0],
                                   b1_vec.AsType<int8x4_t>()[I0],
                                   b2_vec.AsType<int8x4_t>()[I0],
                                   b3_vec.AsType<int8x4_t>()[I0],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

    amd_assembly_outer_product_1x4(a_vec.AsType<int8x4_t>()[I1],
                                   b0_vec.AsType<int8x4_t>()[I1],
                                   b1_vec.AsType<int8x4_t>()[I1],
                                   b2_vec.AsType<int8x4_t>()[I1],
                                   b3_vec.AsType<int8x4_t>()[I1],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

    amd_assembly_outer_product_1x4(a_vec.AsType<int8x4_t>()[I2],
                                   b0_vec.AsType<int8x4_t>()[I2],
                                   b1_vec.AsType<int8x4_t>()[I2],
                                   b2_vec.AsType<int8x4_t>()[I2],
                                   b3_vec.AsType<int8x4_t>()[I2],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

    amd_assembly_outer_product_1x4(a_vec.AsType<int8x4_t>()[I3],
                                   b0_vec.AsType<int8x4_t>()[I3],
                                   b1_vec.AsType<int8x4_t>()[I3],
                                   b2_vec.AsType<int8x4_t>()[I3],
                                   b3_vec.AsType<int8x4_t>()[I3],
                                   c0,
                                   c1,
                                   c2,
                                   c3);
}

357
358
} // namespace ck
#endif
359
#endif