"vllm/entrypoints/openai/api_server.py" did not exist on "eedb46bf03818796536358f3767ee2b6a619b4f5"
gelu.cu 6.42 KB
Newer Older
Przemek Tredak's avatar
Przemek Tredak committed
1
/*************************************************************************
 * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * See LICENSE for license information.
 ************************************************************************/
6

7
#include "../util/math.h"
8
#include "./activation_template.h"
Przemek Tredak's avatar
Przemek Tredak committed
9

10
void nvte_gelu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
11
  NVTE_API_CALL(nvte_gelu);
Przemek Tredak's avatar
Przemek Tredak committed
12
  using namespace transformer_engine;
13
  act_fn<fp32, Empty, gelu<fp32, fp32>>(input, output, stream);
14
15
}

16
17
18
19
20
21
22
23
// Grouped-tensor variant of the GELU forward entry point.
//
// Hands `input`, `output` and `stream` to dispatch::group_quantize_fwd_helper,
// instantiated with the activation flag set to true, Empty as the parameter
// type and gelu<fp32, fp32> as the operation. The third call argument is
// always nullptr here.
// NOTE(review): that nullptr is presumably an optional quantization config;
// confirm against the helper's declaration.
void nvte_group_gelu(const NVTEGroupedTensor input, NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_gelu);
  using namespace transformer_engine;

  // First template argument of the helper (IS_ACT in the original spelling).
  constexpr bool kApplyActivation = true;

  dispatch::group_quantize_fwd_helper<kApplyActivation, Empty, gelu<fp32, fp32>>(
      input, output, nullptr, stream);
}

24
void nvte_dgelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
25
26
27
                cudaStream_t stream) {
  NVTE_API_CALL(nvte_dgelu);
  using namespace transformer_engine;
28
  dact_fn<fp32, Empty, dgelu<fp32, fp32>>(grad, input, output, stream);
Przemek Tredak's avatar
Przemek Tredak committed
29
}
30

31
32
33
34
35
36
37
38
39
40
41
42
43
44
void nvte_group_dgelu(const NVTEGroupedTensor grad, const NVTEGroupedTensor input,
                      NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_dgelu);
  using namespace transformer_engine;
  NVTETensor dbias = nullptr;
  NVTETensor workspace = nullptr;

  constexpr bool IS_DBIAS = false;
  constexpr bool IS_DACT = true;

  dispatch::group_quantize_bwd_helper<IS_DBIAS, IS_DACT, Empty, dgelu<fp32, fp32>>(
      grad, input, output, dbias, workspace, nullptr, stream);
}

45
46
47
48
49
50
51
52
53
54
55
56
57
// Entry point combining the GELU backward op with dbias (name suggests the
// result is also quantized; the quantization itself is not visible here).
//
// Calls dispatch::quantize_bwd_helper with both the dbias and dact flags on,
// Empty as the parameter type and dgelu<fp32, fp32> as the derivative
// operation. `input`, `activation_input`, `output`, `dbias`, `workspace` and
// `stream` are forwarded as given; the argument before `stream` is nullptr.
// NOTE(review): `workspace` is presumably scratch space the caller sizes via
// a query call; confirm against the public header.
void nvte_quantize_dbias_dgelu(const NVTETensor input, const NVTETensor activation_input,
                               NVTETensor output, NVTETensor dbias, NVTETensor workspace,
                               cudaStream_t stream) {
  NVTE_API_CALL(nvte_quantize_dbias_dgelu);
  using namespace transformer_engine;

  // Template flags for the helper: dbias and derivative-of-activation both enabled.
  constexpr bool kWithDbias = true;
  constexpr bool kWithDact = true;

  dispatch::quantize_bwd_helper<kWithDbias, kWithDact, Empty, dgelu<fp32, fp32>>(
      input, activation_input, output, dbias, workspace, nullptr, stream);
}

58
59
60
61
62
63
64
65
66
67
68
69
70
71
// Grouped-tensor variant of the GELU-backward-with-dbias entry point.
//
// Calls dispatch::group_quantize_bwd_helper with both the dbias and dact
// flags on, Empty as the parameter type and dgelu<fp32, fp32> as the
// derivative operation. All tensor handles and `stream` are forwarded as
// given; the argument before `stream` is nullptr.
// NOTE(review): `dbias` and `workspace` are plain NVTETensor handles even in
// the grouped API; confirm the expected layout against the public header.
void nvte_group_quantize_dbias_dgelu(const NVTEGroupedTensor input,
                                     const NVTEGroupedTensor activation_input,
                                     NVTEGroupedTensor output, NVTETensor dbias,
                                     NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias_dgelu);
  using namespace transformer_engine;

  // Template flags for the helper: dbias and derivative-of-activation both enabled.
  constexpr bool kWithDbias = true;
  constexpr bool kWithDact = true;

  dispatch::group_quantize_bwd_helper<kWithDbias, kWithDact, Empty, dgelu<fp32, fp32>>(
      input, activation_input, output, dbias, workspace, nullptr, stream);
}

72
void nvte_geglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
73
  NVTE_API_CALL(nvte_geglu);
74
  using namespace transformer_engine;
75
76
  Empty e = {};
  gated_act_fn<fp32, Empty, gelu<fp32, fp32>>(input, output, e, stream);
77
78
}

79
void nvte_dgeglu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
80
                 cudaStream_t stream) {
81
  NVTE_API_CALL(nvte_dgeglu);
82
  using namespace transformer_engine;
83
84
  Empty e = {};
  dgated_act_fn<fp32, Empty, gelu<fp32, fp32>, dgelu<fp32, fp32>>(grad, input, output, e, stream);
85
}
86

87
void nvte_qgelu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
88
89
  NVTE_API_CALL(nvte_qgelu);
  using namespace transformer_engine;
90
  act_fn<fp32, Empty, qgelu<fp32, fp32>>(input, output, stream);
91
92
}

93
94
95
96
97
98
99
100
101
// Grouped-tensor variant of the QGELU forward entry point.
//
// Hands `input`, `output` and `stream` to dispatch::group_quantize_fwd_helper,
// instantiated with the activation flag set to true, Empty as the parameter
// type and qgelu<fp32, fp32> as the operation. The third call argument is
// always nullptr here.
// NOTE(review): that nullptr is presumably an optional quantization config;
// confirm against the helper's declaration.
void nvte_group_qgelu(const NVTEGroupedTensor input, NVTEGroupedTensor output,
                      cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_qgelu);
  using namespace transformer_engine;

  // First template argument of the helper (IS_ACT in the original spelling).
  constexpr bool kApplyActivation = true;

  dispatch::group_quantize_fwd_helper<kApplyActivation, Empty, qgelu<fp32, fp32>>(
      input, output, nullptr, stream);
}

102
103
void nvte_dqgelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                 cudaStream_t stream) {
104
105
  NVTE_API_CALL(nvte_dqgelu);
  using namespace transformer_engine;
106
  dact_fn<fp32, Empty, dqgelu<fp32, fp32>>(grad, input, output, stream);
107
}
108

109
110
111
112
113
114
115
116
117
118
119
120
121
122
void nvte_group_dqgelu(const NVTEGroupedTensor grad, const NVTEGroupedTensor input,
                       NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_dqgelu);
  using namespace transformer_engine;
  NVTETensor dbias = nullptr;
  NVTETensor workspace = nullptr;

  constexpr bool IS_DBIAS = false;
  constexpr bool IS_DACT = true;

  dispatch::group_quantize_bwd_helper<IS_DBIAS, IS_DACT, Empty, dqgelu<fp32, fp32>>(
      grad, input, output, dbias, workspace, nullptr, stream);
}

123
124
125
126
127
128
129
130
131
132
133
134
135
// Entry point combining the QGELU backward op with dbias (name suggests the
// result is also quantized; the quantization itself is not visible here).
//
// Calls dispatch::quantize_bwd_helper with both the dbias and dact flags on,
// Empty as the parameter type and dqgelu<fp32, fp32> as the derivative
// operation. `input`, `activation_input`, `output`, `dbias`, `workspace` and
// `stream` are forwarded as given; the argument before `stream` is nullptr.
// NOTE(review): `workspace` is presumably scratch space the caller sizes via
// a query call; confirm against the public header.
void nvte_quantize_dbias_dqgelu(const NVTETensor input, const NVTETensor activation_input,
                                NVTETensor output, NVTETensor dbias, NVTETensor workspace,
                                cudaStream_t stream) {
  NVTE_API_CALL(nvte_quantize_dbias_dqgelu);
  using namespace transformer_engine;

  // Template flags for the helper: dbias and derivative-of-activation both enabled.
  constexpr bool kWithDbias = true;
  constexpr bool kWithDact = true;

  dispatch::quantize_bwd_helper<kWithDbias, kWithDact, Empty, dqgelu<fp32, fp32>>(
      input, activation_input, output, dbias, workspace, nullptr, stream);
}

136
137
138
139
140
141
142
143
144
145
146
147
148
149
// Grouped-tensor variant of the QGELU-backward-with-dbias entry point.
//
// Calls dispatch::group_quantize_bwd_helper with both the dbias and dact
// flags on, Empty as the parameter type and dqgelu<fp32, fp32> as the
// derivative operation. All tensor handles and `stream` are forwarded as
// given; the argument before `stream` is nullptr.
// NOTE(review): `dbias` and `workspace` are plain NVTETensor handles even in
// the grouped API; confirm the expected layout against the public header.
void nvte_group_quantize_dbias_dqgelu(const NVTEGroupedTensor input,
                                      const NVTEGroupedTensor activation_input,
                                      NVTEGroupedTensor output, NVTETensor dbias,
                                      NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias_dqgelu);
  using namespace transformer_engine;

  // Template flags for the helper: dbias and derivative-of-activation both enabled.
  constexpr bool kWithDbias = true;
  constexpr bool kWithDact = true;

  dispatch::group_quantize_bwd_helper<kWithDbias, kWithDact, Empty, dqgelu<fp32, fp32>>(
      input, activation_input, output, dbias, workspace, nullptr, stream);
}

150
void nvte_qgeglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
151
152
  NVTE_API_CALL(nvte_qgeglu);
  using namespace transformer_engine;
153
154
  Empty e = {};
  gated_act_fn<fp32, Empty, qgelu<fp32, fp32>>(input, output, e, stream);
155
156
}

157
158
void nvte_dqgeglu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                  cudaStream_t stream) {
159
160
  NVTE_API_CALL(nvte_dqgeglu);
  using namespace transformer_engine;
161
162
  Empty e = {};
  dgated_act_fn<fp32, Empty, qgelu<fp32, fp32>, dqgelu<fp32, fp32>>(grad, input, output, e, stream);
163
}