// Adapted from https://github.com/NVIDIA/apex/blob/master/csrc/fused_dense.cpp
// We make it work for bfloat16
#include <torch/extension.h>
#include <torch/torch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>

#include <stdio.h>

// Assert that tensor `x` has exactly the shape listed in the variadic args;
// the error message stringifies both the tensor name and the expected dims.
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

// https://github.com/NVIDIA/apex/blob/master/csrc/type_shim.h
// Runtime-dtype dispatch: binds `scalar_t` to at::Half or at::BFloat16
// according to TYPE, then invokes the lambda passed in __VA_ARGS__.
// Errors out for any other dtype. (No comments inside the macro body —
// `//` would swallow the line-continuation backslashes.)
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#define DISPATCH_HALF_AND_BF16(TYPE, NAME, ...)                                \
  switch (TYPE) {                                                              \
  case at::ScalarType::Half: {                                                 \
    using scalar_t = at::Half;                                                 \
    __VA_ARGS__();                                                             \
    break;                                                                     \
  }                                                                            \
  case at::ScalarType::BFloat16: {                                             \
    using scalar_t = at::BFloat16;                                             \
    __VA_ARGS__();                                                             \
    break;                                                                     \
  }                                                                            \
  default:                                                                     \
    AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");            \
  }

// Weight-gradient (and optional bias-gradient) kernel for a linear layer,
// implemented in the companion .cu file (presumably via cuBLASLt, given the
// `lt_workspace` argument — see the Apex source this was adapted from).
// Returns 0 on success, non-zero on failure.
template <typename T>
int linear_bias_wgrad_cuda(const T *input, const T *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, T *d_weight, T *d_bias, void *lt_workspace, size_t workspaceSize);

// Fused linear + activation (GELU or ReLU) forward kernel, implemented in the
// companion .cu file. Writes the matmul+activation result into `output` and,
// when `pre_act` is non-null, also saves the pre-activation (GELU) or an
// activation bit-mask (ReLU) for the backward pass. Returns 0 on success.
template <typename T>
int linear_act_forward_cuda(const T *input, const T *weight, const T *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *output, void *pre_act, void *lt_workspace, size_t workspaceSize);

// Fused backward kernel: computes the input gradient through the activation
// (using the saved `pre_act`) and the bias gradient, implemented in the
// companion .cu file. Returns 0 on success, non-zero on failure.
template <typename T>
int bias_act_linear_dgrad_bgrad_cuda(const T *weight, const T *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *d_input, T *d_bias, void *lt_workspace, size_t workspaceSize);

// Compute d_weight (and optionally d_bias) for a linear layer.
//   input:    (batch_size, in_features), fp16 or bf16, CUDA, contiguous
//   d_output: (batch_size, out_features), same dtype as input
// Returns {d_weight, d_bias}; d_bias is an undefined tensor when
// has_d_bias is false.
std::vector<at::Tensor> linear_bias_wgrad(at::Tensor input, at::Tensor d_output, bool has_d_bias) {
  int64_t batch_size = input.size(0);
  int64_t in_features = input.size(1);
  int64_t out_features = d_output.size(1);

  TORCH_CHECK(input.dtype() == torch::kFloat16 || input.dtype() == torch::kBFloat16);
  TORCH_CHECK(input.dtype() == d_output.dtype());
  TORCH_CHECK(input.is_cuda());
  TORCH_CHECK(d_output.is_cuda());
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(d_output.is_contiguous());
  CHECK_SHAPE(input, batch_size, in_features);
  CHECK_SHAPE(d_output, batch_size, out_features);

  // Otherwise the kernel will be launched from cuda:0 device
  // Cast to char to avoid compiler warning about narrowing
  at::cuda::CUDAGuard device_guard{(char)input.get_device()};

  // create output/workspace tensor
  auto opts = input.options();
  auto d_weight = at::empty({out_features, in_features}, opts);
  at::Tensor d_bias;
  if (has_d_bias) {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION < 11600
    // Older cuBLAS lacks the fused bias-grad epilogue: reduce on the host side.
    d_bias = d_output.view({-1, out_features}).sum(0, false);
#else
    d_bias = at::empty({out_features}, opts);
#endif
  }
  // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M.
  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
  // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91
  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
  auto lt_workspace = at::empty({static_cast<int64_t>(workspaceSize)}, opts.dtype(torch::kUInt8));

  DISPATCH_HALF_AND_BF16(input.scalar_type(), "linear_bias_wgrad", [&] {
    auto result = linear_bias_wgrad_cuda<scalar_t>(
        input.data_ptr<scalar_t>(),
        d_output.data_ptr<scalar_t>(),
        in_features,
        batch_size,
        out_features,
        d_weight.data_ptr<scalar_t>(),
        has_d_bias ? d_bias.data_ptr<scalar_t>() : nullptr,
        (void*) (lt_workspace.data_ptr()),
        workspaceSize);
    TORCH_CHECK(result == 0, "linear_bias_wgrad failed.");
  });

  return {d_weight, d_bias};
}

// Fused linear + activation forward pass.
//   input:  (batch_size, in_features), fp16 or bf16, CUDA, contiguous
//   weight: (out_features, in_features), same dtype as input
//   bias_:  optional (out_features,)
//   is_gelu: GELU if true, otherwise ReLU
//   save_pre_act: also return the tensor needed for backward
//   heuristic: cuBLASLt algorithm-selection index, forwarded to the kernel
// Returns {output} or {output, pre_act} when save_pre_act is true.
std::vector<at::Tensor> linear_act_forward(at::Tensor input, at::Tensor weight,
                                           c10::optional<at::Tensor> bias_,
                                           bool is_gelu, bool save_pre_act, int heuristic) {
  int64_t batch_size = input.size(0);
  int64_t in_features = input.size(1);
  int64_t out_features = weight.size(0);

  TORCH_CHECK(input.dtype() == torch::kFloat16 || input.dtype() == torch::kBFloat16);
  TORCH_CHECK(input.dtype() == weight.dtype());
  TORCH_CHECK(input.is_cuda());
  TORCH_CHECK(weight.is_cuda());
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(weight.is_contiguous());
  CHECK_SHAPE(input, batch_size, in_features);
  CHECK_SHAPE(weight, out_features, in_features);
  if (bias_.has_value()) {
    auto bias = bias_.value();
    TORCH_CHECK(bias.dtype() == input.dtype());
    TORCH_CHECK(bias.is_cuda());
    TORCH_CHECK(bias.is_contiguous());
    CHECK_SHAPE(bias, out_features);
  }

  // Otherwise the kernel will be launched from cuda:0 device
  // Cast to char to avoid compiler warning about narrowing
  at::cuda::CUDAGuard device_guard{(char)input.get_device()};

  // create output/workspace tensor
  auto opts = input.options();
  auto output = at::empty({batch_size, out_features}, opts);
  at::Tensor pre_act;
  // If ReLU, cuBlasLT stores a bit-mask (1 bit per element)
  if (save_pre_act) { pre_act = at::empty({batch_size, is_gelu ? out_features : out_features / 8},
                                          is_gelu ? opts : opts.dtype(torch::kUInt8)); }
  // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M.
  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
  // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91
  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
  auto lt_workspace = at::empty({static_cast<int64_t>(workspaceSize)}, opts.dtype(torch::kUInt8));

  DISPATCH_HALF_AND_BF16(input.scalar_type(), "linear_act_forward", [&] {
    auto result = linear_act_forward_cuda<scalar_t>(
        input.data_ptr<scalar_t>(),
        weight.data_ptr<scalar_t>(),
        bias_.has_value()? bias_.value().data_ptr<scalar_t>() : nullptr,
        in_features,
        batch_size,
        out_features,
        is_gelu,
        heuristic,
        output.data_ptr<scalar_t>(),
        save_pre_act ? pre_act.data_ptr() : nullptr,
        (void*) (lt_workspace.data_ptr()),
        workspaceSize);
    TORCH_CHECK(result == 0, "linear_act_forward failed.");
  });

  std::vector<at::Tensor> result = {output};
  if (save_pre_act) { result.push_back(pre_act); };
  return result;
}

// Fused backward pass: propagates d_output through the linear layer and the
// activation (using pre_act saved by the forward), producing d_input, and
// computes d_bias.
//   weight:   (out_features, in_features), fp16 or bf16, CUDA, contiguous
//   d_output: (batch_size, out_features), same dtype as weight
//   pre_act:  (batch_size, in_features) in weight's dtype for GELU, or a
//             uint8 bit-mask (batch_size, in_features / 8) for ReLU
// Returns {d_input, d_bias}.
std::vector<at::Tensor> bias_act_linear_dgrad_bgrad(
  at::Tensor weight, at::Tensor d_output, at::Tensor pre_act, bool is_gelu, int heuristic
) {
  int64_t batch_size = d_output.size(0);
  int64_t out_features = d_output.size(1);
  int64_t in_features = weight.size(1);

  TORCH_CHECK(weight.dtype() == torch::kFloat16 || weight.dtype() == torch::kBFloat16);
  TORCH_CHECK(weight.dtype() == d_output.dtype());
  TORCH_CHECK(is_gelu ? (pre_act.dtype() == weight.dtype()) : (pre_act.dtype() == torch::kUInt8));
  TORCH_CHECK(weight.is_cuda());
  TORCH_CHECK(d_output.is_cuda());
  TORCH_CHECK(pre_act.is_cuda());
  TORCH_CHECK(weight.is_contiguous());
  TORCH_CHECK(d_output.is_contiguous());
  TORCH_CHECK(pre_act.is_contiguous());
  CHECK_SHAPE(weight, out_features, in_features);
  CHECK_SHAPE(d_output, batch_size, out_features);
  // If ReLU, cuBlasLT stores a bit-mask (1 bit per element)
  CHECK_SHAPE(pre_act, batch_size, is_gelu ? in_features : in_features / 8);

  // Otherwise the kernel will be launched from cuda:0 device
  // Cast to char to avoid compiler warning about narrowing
  at::cuda::CUDAGuard device_guard{(char)weight.get_device()};

  // create output/workspace tensor
  auto opts = weight.options();
  auto d_bias = at::empty({in_features}, opts);
  auto d_input = at::empty({batch_size, in_features}, opts);
  // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M.
  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
  // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91
  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
  auto lt_workspace = at::empty({static_cast<int64_t>(workspaceSize)}, opts.dtype(torch::kUInt8));

  DISPATCH_HALF_AND_BF16(weight.scalar_type(), "bias_act_linear_dgrad_bgrad", [&] {
    auto result = bias_act_linear_dgrad_bgrad_cuda<scalar_t>(
        weight.data_ptr<scalar_t>(),
        d_output.data_ptr<scalar_t>(),
        pre_act.data_ptr(),
        in_features,
        batch_size,
        out_features,
        is_gelu,
        heuristic,
        d_input.data_ptr<scalar_t>(),
        d_bias.data_ptr<scalar_t>(),
        (void*) (lt_workspace.data_ptr()),
        workspaceSize);
    TORCH_CHECK(result == 0, "bias_act_linear_dgrad_bgrad failed.");
  });

  return {d_input, d_bias};
}

// Python bindings for the three host-side entry points above.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("linear_bias_wgrad", &linear_bias_wgrad, "linear bias wgrad");
  m.def("linear_act_forward", &linear_act_forward, "linear gelu/relu forward");
  m.def("bias_act_linear_dgrad_bgrad", &bias_act_linear_dgrad_bgrad, "bias gelu/relu linear dgrad bgrad");
}