/**
 *  Copyright (c) 2019 by Contributors
 * @file intel/cpu_support.h
 * @brief Intel CPU support
 * @author Pawel Piotrowicz <pawel.piotrowicz@intel.com>
 */
#ifndef INTEL_CPU_SUPPORT_H_
#define INTEL_CPU_SUPPORT_H_
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <tuple>
#include <type_traits>

#include "dmlc/logging.h"
#include "meta_utils.h"
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"

namespace dgl {

/** @brief Type list checked by the kernels' static_assert / SFINAE guards. */
using supported_types = std::tuple<float, double>;

#ifndef log_intel
/**
 * @brief Stream @p x to LOG(INFO) when IntelKernel<>::IsLogEnabled() is
 *        non-zero; otherwise do nothing.
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement: it cannot capture an `else` that follows the call site and it
 * behaves correctly inside an unbraced if/else. Invoke it with a trailing
 * semicolon, e.g. `log_intel("value=" << v);`.
 */
#define log_intel(x)                     \
  do {                                   \
    if (IntelKernel<>::IsLogEnabled()) { \
      LOG(INFO) << x;                    \
    }                                    \
  } while (0)
#endif

/**
 * @brief Build a Zmm operand that has the same register index as @p v.
 * @param v Vector register operand whose index is reused.
 * @return A fresh Xbyak::Zmm constructed from v.getIdx().
 */
static inline Xbyak::Zmm make_zmm(const Xbyak::Xmm &v) {
  const int reg_index = v.getIdx();
  return Xbyak::Zmm(reg_index);
}
/**
 * @brief Run-time switches for the Intel kernels, read from the environment.
 *
 * IsEnabled() and IsLogEnabled() evaluate their environment variable on the
 * first call and return the cached result afterwards.
 *
 * @tparam version Not referenced by any member.
 */
template <int version = 0>
struct IntelKernel {
  /**
   * @brief Read DGL_CPU_INTEL_KERNEL_ENABLED and parse it as a base-10
   *        integer.
   * @return The parsed value; 0 when the variable is unset or does not start
   *         with a number.
   */
  static int64_t GetValue() {
    const char *label = "DGL_CPU_INTEL_KERNEL_ENABLED";
    const char *ptr = std::getenv(label);
    if (ptr == nullptr) {
      return 0;
    }
    // strtoll parses exactly like atoll for in-range text, but saturates on
    // out-of-range input instead of invoking undefined behaviour.
    const int64_t v = static_cast<int64_t>(std::strtoll(ptr, nullptr, 10));
    log_intel(label << "=>" << v);
    return v;
  }

  /**
   * @brief Cached result of GetValue().
   * @return The value parsed on the first call.
   */
  static int64_t IsEnabled() {
    static const int64_t r = IntelKernel<version>::GetValue();
    return r;
  }

  /**
   * @brief Tell whether DGL_CPU_INTEL_KERNEL_LOG is present in the
   *        environment (its content is not inspected).
   * @return 1 when the variable is set, 0 otherwise; cached after first call.
   */
  static int IsLogEnabled() {
    static const int r =
        (std::getenv("DGL_CPU_INTEL_KERNEL_LOG") != nullptr) ? 1 : 0;
    return r;
  }
};

/**
 * @brief Element-wise addition kernel using Intel AVX512 instructions.
 * @note It uses AVX512.
 */
template <class Op>
class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
  /*
   * Implementation notes.
   *
   * The constructor assembles a single function with Xbyak. For every
   * element index i below the size argument the generated code performs
   *   CopyLhs          : out[i] += left[i]
   *   CopyRhs          : out[i] += right[i]
   *   Add/Sub/Mul/Div  : out[i] += left[i] <op> right[i]
   * Whole 512-bit vectors are processed in a loop ("for_i"); the leftover
   * elements are processed once under opmask k1 ("remainder").
   *
   * Register usage inside the generated code:
   *   rdi = out, rsi = left, rdx = right, rcx = element count,
   *   r8  = element count % UNIT_PER_REG, r9 = current element index,
   *   rax = scratch used to build the tail mask, zmm0..zmm3 = data.
   *
   * NOTE(review): rdi/rsi/rdx/rcx are the first four integer-argument
   * registers of the System V AMD64 calling convention, so run() is
   * presumably meant to be called as run(out, left, right, size) on such
   * targets only - confirm against the callers.
   */
 public:
  /** @brief Element type of the kernel, taken from the operator. */
  using DType = typename Op::type;
  static_assert(
      std::is_base_of<
          std::true_type, utils::has_type<DType, supported_types>>::value,
      "Use case fail dgl::ElemWiseAddUpdate< Operator<DType> > DType is not "
      "supported !");

 protected:
  const Xbyak::Reg64 &r_out_;   // bound to rdi: output buffer
  const Xbyak::Reg64 &r_left_;  // bound to rsi: left operand buffer
  const Xbyak::Reg64 &r_right;  // bound to rdx: right operand buffer
  const Xbyak::Reg64 &r_size_;  // bound to rcx: number of elements

  /* True once the AVX512F code path has been emitted by the constructor. */
  bool applicable_;

 public:
  static constexpr int UNIT_SIZE_BYTES = sizeof(DType);
  static constexpr int BITS_IN_BYTES = 8;
  static constexpr int REG_BIT_SIZE = 512;
  /* Elements per zmm register: 16 for float, 8 for double. */
  static constexpr int UNIT_PER_REG =
      REG_BIT_SIZE / (UNIT_SIZE_BYTES * BITS_IN_BYTES);

  /** @brief Emit an unaligned packed-float move (vmovups r1, r2). */
  template <
      class TType, class R1, class R2, utils::CheckCmp<TType, float> = true>
  void alias_load(R1 r1, R2 r2) {
    vmovups(r1, r2);
  }

  /** @brief Emit an unaligned packed-double move (vmovupd r1, r2). */
  template <
      class TType, class R1, class R2, utils::CheckCmp<TType, double> = true>
  void alias_load(R1 r1, R2 r2) {
    vmovupd(r1, r2);
  }

  /** @brief Store float data; the same move instruction as alias_load. */
  template <
      class TType, class R1, class R2, utils::CheckCmp<TType, float> = true>
  void alias_save(R1 r1, R2 r2) {
    alias_load<TType>(r1, r2);
  }

  /** @brief Store double data; the same move instruction as alias_load. */
  template <
      class TType, class R1, class R2, utils::CheckCmp<TType, double> = true>
  void alias_save(R1 r1, R2 r2) {
    alias_load<TType>(r1, r2);
  }

  /** @brief Emit r1 = r2 + r3 for packed floats (vaddps). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, float> = true>
  void alias_ADD(R1 r1, R2 r2, R3 r3) {
    vaddps(r1, r2, r3);
  }

  /** @brief Emit r1 = r2 + r3 for packed doubles (vaddpd). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, double> = true>
  void alias_ADD(R1 r1, R2 r2, R3 r3) {
    vaddpd(r1, r2, r3);
  }

  /** @brief Emit r1 = r2 - r3 for packed floats (vsubps). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, float> = true>
  void alias_SUB(R1 r1, R2 r2, R3 r3) {
    vsubps(r1, r2, r3);
  }

  /** @brief Emit r1 = r2 - r3 for packed doubles (vsubpd). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, double> = true>
  void alias_SUB(R1 r1, R2 r2, R3 r3) {
    vsubpd(r1, r2, r3);
  }

  /** @brief Emit r1 = r2 / r3 for packed floats (vdivps). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, float> = true>
  void alias_DIV(R1 r1, R2 r2, R3 r3) {
    vdivps(r1, r2, r3);
  }

  /** @brief Emit r1 = r2 / r3 for packed doubles (vdivpd). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, double> = true>
  void alias_DIV(R1 r1, R2 r2, R3 r3) {
    vdivpd(r1, r2, r3);
  }

  /** @brief Emit r1 = r2 * r3 for packed floats (vmulps). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, float> = true>
  void alias_MUL(R1 r1, R2 r2, R3 r3) {
    vmulps(r1, r2, r3);
  }

  /** @brief Emit r1 = r2 * r3 for packed doubles (vmulpd). */
  template <
      class TType, class R1, class R2, class R3,
      utils::CheckCmp<TType, double> = true>
  void alias_MUL(R1 r1, R2 r2, R3 r3) {
    vmulpd(r1, r2, r3);
  }

  /** @brief Full-vector body for CopyLhs: out[r9..] += left[r9..]. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs, supported_types> =
          true>
  void full_chunk_loop_operations() {
    typedef typename Operator::type IType;
    alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
    alias_load<IType>(zmm1, ptr[r_left_ + r9 * sizeof(IType)]);
    alias_ADD<IType>(zmm2, zmm0, zmm1);
    alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
  }

  /** @brief Full-vector body for CopyRhs: out[r9..] += right[r9..]. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs, supported_types> =
          true>
  void full_chunk_loop_operations() {
    typedef typename Operator::type IType;
    alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
    alias_load<IType>(zmm1, ptr[r_right + r9 * sizeof(IType)]);
    alias_ADD<IType>(zmm2, zmm0, zmm1);
    alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
  }

  /** @brief Load one full vector: zmm0 = out, zmm1 = left, zmm2 = right. */
  template <class T>
  void loop_pre() {
    alias_load<T>(zmm0, ptr[r_out_ + r9 * sizeof(T)]);
    alias_load<T>(zmm1, ptr[r_left_ + r9 * sizeof(T)]);
    alias_load<T>(zmm2, ptr[r_right + r9 * sizeof(T)]);
  }

  /** @brief Accumulate zmm2 into the loaded output (zmm0) and store it. */
  template <class T>
  void loop_post() {
    alias_ADD<T>(zmm2, zmm0, zmm2);
    alias_save<T>(ptr[r_out_ + r9 * sizeof(T)], zmm2);
  }

  /** @brief Full-vector body for Add: out += left + right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Add, supported_types> =
          true>
  void full_chunk_loop_operations() {
    typedef typename Operator::type IType;
    loop_pre<IType>();
    alias_ADD<IType>(zmm2, zmm1, zmm2);
    loop_post<IType>();
  }

  /** @brief Full-vector body for Sub: out += left - right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Sub, supported_types> =
          true>
  void full_chunk_loop_operations() {
    typedef typename Operator::type IType;
    loop_pre<IType>();
    alias_SUB<IType>(zmm2, zmm1, zmm2);
    loop_post<IType>();
  }

  /** @brief Full-vector body for Div: out += left / right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Div, supported_types> =
          true>
  void full_chunk_loop_operations() {
    typedef typename Operator::type IType;
    loop_pre<IType>();
    alias_DIV<IType>(zmm2, zmm1, zmm2);
    loop_post<IType>();
  }

  /** @brief Full-vector body for Mul: out += left * right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Mul, supported_types> =
          true>
  void full_chunk_loop_operations() {
    typedef typename Operator::type IType;
    loop_pre<IType>();
    alias_MUL<IType>(zmm2, zmm1, zmm2);
    loop_post<IType>();
  }

  /**
   * @brief Tail body for CopyLhs: masked load of left into zmm2.
   * @param mask Opmask selecting the leftover elements.
   */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs, supported_types> =
          true>
  void remainder_operations(const Xbyak::Opmask mask) {
    typedef typename Operator::type IType;
    alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(IType)]);
  }

  /**
   * @brief Tail body for CopyRhs: masked load of right into zmm2.
   * @param mask Opmask selecting the leftover elements.
   */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs, supported_types> =
          true>
  void remainder_operations(const Xbyak::Opmask mask) {
    typedef typename Operator::type IType;
    alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_right + r9 * sizeof(IType)]);
  }

  /**
   * @brief Masked loads for the tail: zmm2 = left, zmm1 = right.
   * @param mask Opmask selecting the leftover elements.
   */
  template <class T>
  void remainder_fetch_LR(const Xbyak::Opmask mask) {
    alias_load<T>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(T)]);
    alias_load<T>(make_zmm(zmm1) | mask, ptr[r_right + r9 * sizeof(T)]);
  }

  /** @brief Tail body for Mul: zmm2 = left * right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Mul, supported_types> =
          true>
  void remainder_operations(const Xbyak::Opmask mask) {
    typedef typename Operator::type IType;
    remainder_fetch_LR<IType>(mask);
    alias_MUL<IType>(zmm2, zmm2, zmm1);
  }

  /** @brief Tail body for Add: zmm2 = left + right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Add, supported_types> =
          true>
  void remainder_operations(const Xbyak::Opmask mask) {
    typedef typename Operator::type IType;
    remainder_fetch_LR<IType>(mask);
    // Use the operator's own element type, as the loads above and the
    // full-chunk variants do.
    alias_ADD<IType>(zmm2, zmm2, zmm1);
  }

  /** @brief Tail body for Div: zmm2 = left / right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Div, supported_types> =
          true>
  void remainder_operations(const Xbyak::Opmask mask) {
    typedef typename Operator::type IType;
    remainder_fetch_LR<IType>(mask);
    alias_DIV<IType>(zmm2, zmm2, zmm1);
  }

  /** @brief Tail body for Sub: zmm2 = left - right. */
  template <
      class Operator,
      utils::Verify<Operator, ::dgl::aten::cpu::op::Sub, supported_types> =
          true>
  void remainder_operations(const Xbyak::Opmask mask) {
    typedef typename Operator::type IType;
    remainder_fetch_LR<IType>(mask);
    alias_SUB<IType>(zmm2, zmm2, zmm1);
  }

  /**
   * @brief Assemble the kernel.
   *
   * When the CPU reports AVX512F the full kernel is emitted and
   * applicable() becomes true. Otherwise the generated function consists of
   * a single `ret` and applicable() stays false.
   */
  ElemWiseAddUpdate()
      : r_out_(rdi),
        r_left_(rsi),
        r_right(rdx),
        r_size_(rcx),
        applicable_(false) {
    static Xbyak::util::Cpu current_cpu;

    /* Default case for all */
    if (current_cpu.has(Xbyak::util::Cpu::tAVX512F)) {
      /* Split the element count into full vectors and a remainder. */
      mov(r8, r_size_);
      // r8 = size % UNIT_PER_REG; the mask form works because UNIT_PER_REG
      // is a power of two (16 or 8).
      and_(r8, UNIT_PER_REG - 1);
      xor_(r9, r9);                // r9 = 0, the running element index
      cmp(r_size_, UNIT_PER_REG);  // fewer elements than one full vector ?
      jl("remainder");

      /* r_size_ becomes the number of elements covered by full vectors. */
      sub(r_size_, r8);
      // Not taken when the jl above falls through (size >= UNIT_PER_REG);
      // kept so the emitted code is unchanged.
      cmp(r_size_, 0);
      jz("remainder");

      L("for_i");
      full_chunk_loop_operations<Op>();
      add(r9, UNIT_PER_REG);  // advance by one vector of elements
      cmp(r_size_, r9);       // more full vectors ?
      jnz("for_i");

      L("remainder");
      cmp(r8, 0);  // any leftover elements ?
      jz("done");
      /* Build the tail mask: k1 = (1 << r8) - 1. */
      mov(rax, 1);
      mov(r_size_, r8);  // r_size_ is rcx, so cl now holds the shift count
      sal(rax, cl);
      dec(rax);
      kmovw(k1, eax);

      alias_load<DType>(
          make_zmm(zmm0) | k1, ptr[r_out_ + r9 * UNIT_SIZE_BYTES]);
      remainder_operations<Op>(k1);        // result of Op left in zmm2
      alias_ADD<DType>(zmm3, zmm2, zmm0);  // accumulate into the output
      alias_save<DType>(
          ptr[r_out_ + r9 * UNIT_SIZE_BYTES], make_zmm(zmm3) | k1);

      L("done");
      applicable_ = true;
      log_intel("AVX512F cpu kernel is ready");
    }
    ret();
  }

  /** @return true when the AVX512F kernel was generated. */
  bool applicable() const { return applicable_; }

  /**
   * @brief Call the generated code as a function `void(P...)`.
   * @param args Arguments forwarded by value to the generated function.
   *
   * When applicable() is false the generated function only returns, so the
   * call has no effect.
   */
  template <class... P>
  void run(P... args) {
    using KernelFn = void (*)(P...);
    // Xbyak's typed accessor replaces a C-style cast of the code pointer.
    getCode<KernelFn>()(args...);
  }
};

}  // namespace dgl

#endif  // INTEL_CPU_SUPPORT_H_