/*************************************************************************
 * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * See LICENSE for license information.
 ************************************************************************/

7
#include "../util/math.h"
8
#include "./activation_template.h"
Przemek Tredak's avatar
Przemek Tredak committed
9

10
void nvte_gelu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
11
  NVTE_API_CALL(nvte_gelu);
Przemek Tredak's avatar
Przemek Tredak committed
12
  using namespace transformer_engine;
13
  act_fn<fp32, Empty, gelu<fp32, fp32>>(input, output, stream);
14
15
}

16
void nvte_dgelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
17
18
19
                cudaStream_t stream) {
  NVTE_API_CALL(nvte_dgelu);
  using namespace transformer_engine;
20
  dact_fn<fp32, Empty, dgelu<fp32, fp32>>(grad, input, output, stream);
Przemek Tredak's avatar
Przemek Tredak committed
21
}
22

23
24
25
26
27
28
29
30
31
32
33
34
35
// C API wrapper: quantize backward helper specialised for dGELU with dbias.
//
// Invokes the NVTE_API_CALL macro with this function's name, then calls
// dispatch::quantize_bwd_helper with both boolean template flags (IS_DBIAS and
// IS_DACT) set to true, Empty as the parameter type and dgelu<fp32, fp32> as
// the operator. All tensor handles and the stream are forwarded unchanged; the
// sixth argument is always nullptr.
//
//   input            - first tensor handle passed to the helper.
//   activation_input - second tensor handle passed to the helper
//                      (presumably the forward activation's input - confirm).
//   output           - destination tensor handle.
//   dbias            - tensor handle for the bias-gradient result.
//   workspace        - scratch tensor handle passed through to the helper.
//   stream           - CUDA stream forwarded to the helper.
void nvte_quantize_dbias_dgelu(const NVTETensor input, const NVTETensor activation_input,
                               NVTETensor output, NVTETensor dbias, NVTETensor workspace,
                               cudaStream_t stream) {
  NVTE_API_CALL(nvte_quantize_dbias_dgelu);
  using namespace transformer_engine;

  dispatch::quantize_bwd_helper</*IS_DBIAS=*/true, /*IS_DACT=*/true, Empty, dgelu<fp32, fp32>>(
      input, activation_input, output, dbias, workspace, nullptr, stream);
}

36
void nvte_geglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
37
  NVTE_API_CALL(nvte_geglu);
38
  using namespace transformer_engine;
39
40
  Empty e = {};
  gated_act_fn<fp32, Empty, gelu<fp32, fp32>>(input, output, e, stream);
41
42
}

43
void nvte_dgeglu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
44
                 cudaStream_t stream) {
45
  NVTE_API_CALL(nvte_dgeglu);
46
  using namespace transformer_engine;
47
48
  Empty e = {};
  dgated_act_fn<fp32, Empty, gelu<fp32, fp32>, dgelu<fp32, fp32>>(grad, input, output, e, stream);
49
}
50

51
void nvte_qgelu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
52
53
  NVTE_API_CALL(nvte_qgelu);
  using namespace transformer_engine;
54
  act_fn<fp32, Empty, qgelu<fp32, fp32>>(input, output, stream);
55
56
}

57
58
void nvte_dqgelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                 cudaStream_t stream) {
59
60
  NVTE_API_CALL(nvte_dqgelu);
  using namespace transformer_engine;
61
  dact_fn<fp32, Empty, dqgelu<fp32, fp32>>(grad, input, output, stream);
62
}
63

64
65
66
67
68
69
70
71
72
73
74
75
76
// C API wrapper: quantize backward helper specialised for dQGELU with dbias.
//
// Invokes the NVTE_API_CALL macro with this function's name, then calls
// dispatch::quantize_bwd_helper with both boolean template flags (IS_DBIAS and
// IS_DACT) set to true, Empty as the parameter type and dqgelu<fp32, fp32> as
// the operator. All tensor handles and the stream are forwarded unchanged; the
// sixth argument is always nullptr.
//
//   input            - first tensor handle passed to the helper.
//   activation_input - second tensor handle passed to the helper
//                      (presumably the forward activation's input - confirm).
//   output           - destination tensor handle.
//   dbias            - tensor handle for the bias-gradient result.
//   workspace        - scratch tensor handle passed through to the helper.
//   stream           - CUDA stream forwarded to the helper.
void nvte_quantize_dbias_dqgelu(const NVTETensor input, const NVTETensor activation_input,
                                NVTETensor output, NVTETensor dbias, NVTETensor workspace,
                                cudaStream_t stream) {
  NVTE_API_CALL(nvte_quantize_dbias_dqgelu);
  using namespace transformer_engine;

  dispatch::quantize_bwd_helper</*IS_DBIAS=*/true, /*IS_DACT=*/true, Empty, dqgelu<fp32, fp32>>(
      input, activation_input, output, dbias, workspace, nullptr, stream);
}

77
void nvte_qgeglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
78
79
  NVTE_API_CALL(nvte_qgeglu);
  using namespace transformer_engine;
80
81
  Empty e = {};
  gated_act_fn<fp32, Empty, qgelu<fp32, fp32>>(input, output, e, stream);
82
83
}

84
85
void nvte_dqgeglu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                  cudaStream_t stream) {
86
87
  NVTE_API_CALL(nvte_dqgeglu);
  using namespace transformer_engine;
88
89
  Empty e = {};
  dgated_act_fn<fp32, Empty, qgelu<fp32, fp32>, dqgelu<fp32, fp32>>(grad, input, output, e, stream);
90
}