// dnnl_helper.hpp
// Helpers for running int8 matrix multiplications through oneDNN on the CPU.
#ifndef DNNL_HELPER_HPP
#define DNNL_HELPER_HPP

#include <cstdint>
#include <type_traits>
#include <unordered_map>

#include <c10/util/BFloat16.h>
#include <c10/util/Half.h>

#include "oneapi/dnnl/dnnl.hpp"

// Compile-time mapping from a C++ element type to the matching oneDNN memory
// data type. Element types without a specialization map to `undef`.
//
// These helpers are deliberately NOT wrapped in an unnamed namespace: this is
// a header, and an unnamed namespace would give every including translation
// unit its own internal-linkage copy while DNNLPrimitiveHelper (external
// linkage) refers to them. Templates and constexpr static data members are
// already safe to define in multiple translation units.
template <typename T>
struct DNNLType {
  static constexpr dnnl::memory::data_type type =
      dnnl::memory::data_type::undef;
};

// Signed 8-bit integer.
template <>
struct DNNLType<int8_t> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
};

// Signed 32-bit integer.
template <>
struct DNNLType<int32_t> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
};

// 32-bit floating point.
template <>
struct DNNLType<float> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
};

// bfloat16.
template <>
struct DNNLType<c10::BFloat16> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
};

// IEEE half precision.
template <>
struct DNNLType<c10::Half> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
};

// Returns the oneDNN data type registered for T. T is passed through
// std::decay_t first, so references and cv-qualifiers are ignored
// (e.g. `const float&` yields f32).
template <typename T>
constexpr dnnl::memory::data_type get_dnnl_type() noexcept {
  return DNNLType<std::decay_t<T>>::type;
}

template <bool InputNoScale>
class DNNLPrimitiveHelper {
 public:
  // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
  // A: [M, K], row-major
  // B: [K, N], column-major
  // C: [M, N], row-major
  // bias: [N], row-major, optional
  // a_scales: [MS]
  // b_scales: [NS]
  // Note: Due to the limitation of oneDNN
  // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
  // not supported.
60

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  template <typename OutputT, typename BiasT>
  static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
                            const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
                            dnnl_dim_t K, const float* a_scales,
                            const float* b_scales, dnnl_dim_t MS,
                            dnnl_dim_t NS) {
    auto&& OutputType = get_dnnl_type<OutputT>();
    auto&& BiasType = get_dnnl_type<BiasT>();

    dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
    dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
    dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});

    dnnl::primitive_attr attr;
    if constexpr (!InputNoScale) {
      if (MS == 1) {
        // per-tensor
        attr.set_scales_mask(DNNL_ARG_SRC, 0);
      } else {
        // per-token
        TORCH_CHECK(false, "per-token quantization is unsupported.");
      }
    }

    if (NS == 1) {
      // per-tensor
      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
    } else {
      // per-channel
      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
    }

    dnnl::matmul::primitive_desc matmul_pd;
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// Create memory descriptors with format_tag::any for the primitive. This
// enables the matmul primitive to choose memory layouts for an
// optimized primitive implementation, and these layouts may differ from the
// ones provided by the user.
#ifdef __aarch64__
    auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8,
                                         dnnl::memory::format_tag::any);
    auto mat_weights_md = dnnl::memory::desc(
        {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any);
    auto mat_dst_md =
        dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any);
    if (bias) {
      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md,
                                               mat_weights_md, bias_md,
                                               mat_dst_md, attr);
    } else {
      matmul_pd = dnnl::matmul::primitive_desc(
          default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr);
    }
#else
115
116
117
118
119
120
121
122
    if (bias) {
      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                               bias_md, c_md, attr);
    } else {
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                               c_md, attr);
    }
123
#endif
124
125
126
127
128
129
130
131
132
133
134
135
136
    dnnl::matmul matmul(matmul_pd);

    auto& engine = default_engine();

    dnnl::memory a_m(a_md, engine, (void*)a);
    dnnl::memory b_m(b_md, engine, (void*)b);
    dnnl::memory c_m(c_md, engine, (void*)c);
    dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
                            (void*)a_scales);
    dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
                            (void*)b_scales);

    auto& stream = default_stream();
137
138
139
140
141
142
143
144
145
146

    auto mat_src_mem = a_m;
    auto mat_weights_mem = b_m;
    auto mat_dst_mem = c_m;
#ifdef __aarch64__
    if (matmul_pd.weights_desc() != b_m.get_desc()) {
      mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine);
      dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem);
    }
#endif
147
148
149
150
151
152
    if constexpr (InputNoScale) {
      if (bias) {
        dnnl::memory::desc bias_md({N}, BiasType, {1});
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
153
154
                        {DNNL_ARG_SRC, mat_src_mem},
                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
155
                        {DNNL_ARG_BIAS, bias_m},
156
                        {DNNL_ARG_DST, mat_dst_mem},
157
158
159
160
161
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
162
163
164
                        {DNNL_ARG_SRC, mat_src_mem},
                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
                        {DNNL_ARG_DST, mat_dst_mem},
165
166
167
168
169
170
171
172
173
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      }
    } else {
      if (bias) {
        dnnl::memory::desc bias_md({N}, BiasType, {1});
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
174
175
                        {DNNL_ARG_SRC, mat_src_mem},
                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
176
                        {DNNL_ARG_BIAS, bias_m},
177
                        {DNNL_ARG_DST, mat_dst_mem},
178
179
180
181
182
183
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
184
185
186
                        {DNNL_ARG_SRC, mat_src_mem},
                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
                        {DNNL_ARG_DST, mat_dst_mem},
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      }
    }
    stream.wait();
  }

 private:
  static dnnl::engine& default_engine() {
    static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
    return engine;
  }

  static dnnl::stream& default_stream() {
    static dnnl::stream stream(default_engine());
    return stream;
  }
};
#endif