custom_all_reduce.cu 14.3 KB
Newer Older
1
2
3
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
4
#include <torch/all.h>
5
6
7

#include "custom_all_reduce.cuh"

8
9
// Opaque pointer type; it must match the fptr_t type declared in ops.h.
// Pointers cross the binding boundary as int64_t values under this alias.
typedef int64_t fptr_t;
static_assert(sizeof(fptr_t) == sizeof(void*));
12

13
14
/**
 * Creates a vllm::CustomAllreduce instance and returns it as an opaque handle.
 *
 * @param fake_ipc_ptrs   One entry per rank; each value is reinterpreted as a
 *                        vllm::Signal* and forwarded to the constructor.
 * @param rank_data       Tensor whose data pointer and element count are
 *                        handed to the CustomAllreduce constructor.
 * @param rank            Index of the calling rank; must lie in
 *                        [0, world_size).
 * @param fully_connected Forwarded unchanged to the CustomAllreduce
 *                        constructor.
 * @return The heap-allocated CustomAllreduce pointer cast to fptr_t.
 * @throws std::invalid_argument if the world size exceeds 16, is odd, or if
 *         rank is out of range.
 */
fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
                      torch::Tensor& rank_data, int64_t rank,
                      bool fully_connected) {
  // Capacity of the stack array below; the bound check and the error message
  // must stay in sync with it.
  constexpr int kMaxWorldSize = 16;

  // Compare as size_t first so an oversized vector cannot wrap when narrowed.
  const size_t num_ptrs = fake_ipc_ptrs.size();
  if (num_ptrs > static_cast<size_t>(kMaxWorldSize))
    throw std::invalid_argument("world size > 16 is not supported");
  const int world_size = static_cast<int>(num_ptrs);
  if (world_size % 2 != 0)
    throw std::invalid_argument("Odd num gpus is not supported for now");
  if (rank < 0 || rank >= world_size)
    throw std::invalid_argument("invalid rank passed in");

  vllm::Signal* ipc_ptrs[kMaxWorldSize];
  for (int i = 0; i < world_size; i++) {
    ipc_ptrs[i] = reinterpret_cast<vllm::Signal*>(fake_ipc_ptrs[i]);
  }
  return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
                                            rank_data.numel(), rank, world_size,
                                            fully_connected);
}

/**
 * Make sure tensor t's data lies completely within the byte range starting at
 * (char*)t.data_ptr() and spanning t.numel() * t.element_size() bytes.
 * This is slightly weaker than t.is_contiguous()
 * because it allows transpose of contiguous slice (i.e. slicing the first
 * dimension). Currently, we require this because stride information is not
 * passed into the kernels and we treat input tensors as flat.
 *
 * Examples
 * A = torch.zeros(3, 3, 3)
 * 1. A: OK
 * 2. A[1:]: OK
 * 3. A.permute(2, 0, 1): OK
 * 4. A[1:].permute(2, 0, 1): OK
 * 5. A[None].expand(2, -1, -1, -1): Not OK
 * 6. A[:, 1:, 1:]: Not OK
 */
49
// Returns true when t is contiguous, or when the number of bytes between t's
// storage offset and the end of its storage equals numel * element_size.
bool _is_weak_contiguous(torch::Tensor& t) {
  if (t.is_contiguous()) {
    return true;
  }
  const auto elem_bytes = t.element_size();
  const auto bytes_after_offset =
      t.storage().nbytes() - t.storage_offset() * elem_bytes;
  return bytes_after_offset == t.numel() * elem_bytes;
}

55
56
57
58
59
60
61
/**
 * Performs an out-of-place allreduce and stores result in out.
 *
 * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered.
 * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first
 * copied into _reg_buffer.
 */
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199

/**
 * Out-of-place allreduce of inp fused with a normalization step
 * (CustomAllreduce::allreduce_fuse_norm); the result is stored in out.
 *
 * If _reg_buffer is non-null, inp is first copied into it on the current CUDA
 * stream and the kernel reads from _reg_buffer; otherwise the kernel reads
 * inp.data_ptr() directly.
 * NOTE(review): the buffer the kernel reads from is presumably required to be
 * IPC-registered — confirm against the caller.
 *
 * @param _fa                 Opaque handle to a vllm::CustomAllreduce.
 * @param inp                 Input tensor (float32, float16 or bfloat16).
 * @param out                 Output tensor; same dtype and numel as inp.
 * @param hidden_size         Positive divisor of inp.numel(); the quotient is
 *                            passed to the kernel as the token count.
 * @param residual            Tensor with the same dtype as inp, passed to the
 *                            kernel as a raw pointer.
 * @param rms_weight          Tensor with the same dtype as inp, passed to the
 *                            kernel as a raw pointer.
 * @param eps                 Narrowed to float before being passed on.
 * @param _reg_buffer         Optional staging buffer (0 means "none").
 * @param reg_buffer_sz_bytes Capacity of _reg_buffer in bytes.
 * @throws c10::Error on any failed check; std::runtime_error for an
 *         unsupported dtype.
 */
void all_reduce_fuse_norm(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                        int64_t hidden_size, torch::Tensor& residual, torch::Tensor& rms_weight, 
                        double eps, fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
  auto stream = c10::cuda::getCurrentCUDAStream().stream();

  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
  TORCH_CHECK_EQ(inp.numel(), out.numel());
  // residual and rms_weight are reinterpreted below with the element type of
  // out, so a differing dtype would silently misread their memory.
  TORCH_CHECK_EQ(residual.scalar_type(), inp.scalar_type());
  TORCH_CHECK_EQ(rms_weight.scalar_type(), inp.scalar_type());
  TORCH_CHECK(_is_weak_contiguous(out));
  TORCH_CHECK(_is_weak_contiguous(inp));
  TORCH_CHECK(_is_weak_contiguous(residual));
  TORCH_CHECK(_is_weak_contiguous(rms_weight));
  // Guard the division below against a zero/negative divisor and against a
  // silently truncated token count.
  TORCH_CHECK(hidden_size > 0, "hidden_size must be positive, got ",
              hidden_size);
  TORCH_CHECK(inp.numel() % hidden_size == 0,
              "inp.numel() must be a multiple of hidden_size");
  int token_num = inp.numel() / hidden_size;

  auto input_size = inp.numel() * inp.element_size();
  auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
  if (reg_buffer) {
    TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes);
    AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size,
                                  cudaMemcpyDeviceToDevice, stream));
  } else {
    reg_buffer = inp.data_ptr();
  }
  switch (out.scalar_type()) {
    case at::ScalarType::Float: {
      fa->allreduce_fuse_norm<float>(stream, reinterpret_cast<float*>(reg_buffer),
                          reinterpret_cast<float*>(out.data_ptr()),out.numel(),
                          token_num, hidden_size, reinterpret_cast<float*>(residual.data_ptr()),
                          reinterpret_cast<float*>(rms_weight.data_ptr()), (float)eps);
      break;
    }
    case at::ScalarType::Half: {
      fa->allreduce_fuse_norm<half>(stream, reinterpret_cast<half*>(reg_buffer),
                          reinterpret_cast<half*>(out.data_ptr()),out.numel(),
                          token_num, hidden_size, reinterpret_cast<half*>(residual.data_ptr()),
                          reinterpret_cast<half*>(rms_weight.data_ptr()), (float)eps);
      break;
    }
    case at::ScalarType::BFloat16: {
      fa->allreduce_fuse_norm<nv_bfloat16>(stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),
                          reinterpret_cast<nv_bfloat16*>(out.data_ptr()),out.numel(),
                          token_num, hidden_size, reinterpret_cast<nv_bfloat16*>(residual.data_ptr()),
                          reinterpret_cast<nv_bfloat16*>(rms_weight.data_ptr()), (float)eps);
      break;
    }
    default:
      throw std::runtime_error(
          "custom allreduce only supports float32, float16 and bfloat16");
  }
}


/**
 * Typed implementation behind all_reduce_fuse_norm_quant: launches
 * CustomAllreduce::allreduce_fuse_norm_quant for input element type
 * scalar_in_t, dispatching on the (quantized) dtype of out.
 *
 * If _reg_buffer is non-null, inp is first copied into it on the current CUDA
 * stream and the kernel reads from _reg_buffer; otherwise the kernel reads
 * inp.data_ptr() directly.
 *
 * @tparam scalar_in_t   Element type of inp, rms_weight, norm_out and
 *                       residual.
 * @tparam update_input  Forwarded as the last template argument of the
 *                       kernel launcher.
 * @param residual       Optional; when present its data pointer is passed to
 *                       the kernel, otherwise nullptr is passed.
 * @throws std::runtime_error if rms_weight's data pointer is not 16-byte
 *         aligned or the CustomAllreduce instance is not fully connected;
 *         c10::Error on any failed check.
 */
template<typename scalar_in_t, bool update_input>
void allreduce_fuse_norm_quant_dispath(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, 
                                  int hidden_size,torch::Tensor& rms_weight, double eps, 
                                  torch::Tensor& scales, torch::Tensor& norm_out, 
                                  fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes,
                                  std::optional<at::Tensor> residual) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
  auto stream = c10::cuda::getCurrentCUDAStream().stream();
  TORCH_CHECK(_is_weak_contiguous(inp));
  // Guard the token-count division below against a zero/negative divisor.
  TORCH_CHECK(hidden_size > 0, "hidden_size must be positive, got ",
              hidden_size);

  // Validate everything that can reject the call before any work is enqueued
  // on the stream.
  auto wt_ptr = reinterpret_cast<std::uintptr_t>(rms_weight.data_ptr());
  if (wt_ptr % 16 != 0) {
    throw std::runtime_error(
        "custom allreduce currently requires rms_weight to be 16-byte "
        "aligned, but its address is misaligned by " +
        std::to_string(wt_ptr % 16) + " bytes");
  }
  if (!fa->fully_connected_) {
    throw std::runtime_error(
        "custom allreduce only supports fully_connected");
  }

  int token_num = inp.numel() / hidden_size;

  auto input_size = inp.numel() * inp.element_size();
  auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
  if (reg_buffer) {
    TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes);
    AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size,
                                  cudaMemcpyDeviceToDevice, stream));
  } else {
    reg_buffer = inp.data_ptr();
  }

  // The presence of residual selects a different template instantiation, so
  // the two launches cannot be merged into one runtime-parameterized call.
  if (residual.has_value()) {
    VLLM_DISPATCH_QUANT_TYPES(
      out.scalar_type(), "fa->allreduce_fuse_norm_quant", [&] {
        fa->allreduce_fuse_norm_quant<scalar_in_t, scalar_t, true, update_input>
          (stream, reinterpret_cast<scalar_in_t*>(reg_buffer), out.data_ptr<scalar_t>(),
          out.numel(), token_num, hidden_size, residual->data_ptr<scalar_in_t>(),
          rms_weight.data_ptr<scalar_in_t>(),
          norm_out.data_ptr<scalar_in_t>(),
          eps, scales.data_ptr<float>());
      });
  } else {
    VLLM_DISPATCH_QUANT_TYPES(
      out.scalar_type(), "fa->allreduce_fuse_norm_quant", [&] {
        fa->allreduce_fuse_norm_quant<scalar_in_t, scalar_t, false, update_input>
          (stream, reinterpret_cast<scalar_in_t*>(reg_buffer), out.data_ptr<scalar_t>(),
          out.numel(), token_num, hidden_size, nullptr,
          rms_weight.data_ptr<scalar_in_t>(),
          norm_out.data_ptr<scalar_in_t>(),
          eps, scales.data_ptr<float>());
      });
  }
}
/**
 * Entry point for the fused allreduce + norm + quantization path.
 *
 * Checks that out is fp8 (e4m3fn) or int8, that scales is float32, that inp
 * and out have the same numel and are both contiguous, then dispatches on the
 * floating dtype of inp and on update_input to the matching
 * allreduce_fuse_norm_quant_dispath instantiation.
 */
void all_reduce_fuse_norm_quant(fptr_t fa, torch::Tensor& inp, torch::Tensor& out, 
                          int64_t hidden_size,torch::Tensor& rms_weight, double eps, 
                          torch::Tensor& scales, torch::Tensor& norm_out,
                          fptr_t reg_buffer, int64_t reg_buffer_sz_bytes,
                          std::optional<at::Tensor> residual, bool update_input) {
  static const c10::ScalarType kFp8Type = c10::ScalarType::Float8_e4m3fn;

  TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
  TORCH_CHECK(scales.dtype() == torch::kFloat32);

  TORCH_CHECK_EQ(inp.numel(), out.numel());
  TORCH_CHECK(out.is_contiguous() && inp.is_contiguous());

  VLLM_DISPATCH_FLOATING_TYPES(
        inp.scalar_type(), "allreduce_fuse_norm_quant_dispath", [&] {
          // update_input is a runtime flag but the implementation takes it as
          // a template argument, hence the two explicit instantiations.
          if (!update_input) {
            allreduce_fuse_norm_quant_dispath<scalar_t, false>(
                fa, inp, out, hidden_size, rms_weight, eps, scales, norm_out,
                reg_buffer, reg_buffer_sz_bytes, residual);
            return;
          }
          allreduce_fuse_norm_quant_dispath<scalar_t, true>(
              fa, inp, out, hidden_size, rms_weight, eps, scales, norm_out,
              reg_buffer, reg_buffer_sz_bytes, residual);
        });
}

200
201
/**
 * Out-of-place allreduce of inp into out.
 *
 * When _reg_buffer is non-null, inp is copied into it on the current CUDA
 * stream and the reduction reads from _reg_buffer; otherwise the reduction
 * reads inp.data_ptr() directly. The fully_connected_ flag of the
 * CustomAllreduce instance selects between allreduce and allreduce_pcie.
 *
 * Throws std::runtime_error for dtypes other than float32, float16 and
 * bfloat16.
 */
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
  auto stream = c10::cuda::getCurrentCUDAStream().stream();

  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
  TORCH_CHECK_EQ(inp.numel(), out.numel());
  TORCH_CHECK(_is_weak_contiguous(out));
  TORCH_CHECK(_is_weak_contiguous(inp));

  auto input_size = inp.numel() * inp.element_size();

  // Pick the buffer the reduction reads from, staging inp into the supplied
  // buffer when one was given.
  void* src = reinterpret_cast<void*>(_reg_buffer);
  if (src == nullptr) {
    src = inp.data_ptr();
  } else {
    TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes);
    AT_CUDA_CHECK(cudaMemcpyAsync(src, inp.data_ptr(), input_size,
                                  cudaMemcpyDeviceToDevice, stream));
  }

  const bool use_fully_connected = fa->fully_connected_;
  switch (out.scalar_type()) {
    case at::ScalarType::Float: {
      auto* in_ptr = reinterpret_cast<float*>(src);
      auto* out_ptr = reinterpret_cast<float*>(out.data_ptr());
      if (use_fully_connected) {
        fa->allreduce<float>(stream, in_ptr, out_ptr, out.numel());
      } else {
        fa->allreduce_pcie<float>(stream, in_ptr, out_ptr, out.numel());
      }
      break;
    }
    case at::ScalarType::Half: {
      auto* in_ptr = reinterpret_cast<half*>(src);
      auto* out_ptr = reinterpret_cast<half*>(out.data_ptr());
      if (use_fully_connected) {
        fa->allreduce<half>(stream, in_ptr, out_ptr, out.numel());
      } else {
        fa->allreduce_pcie<half>(stream, in_ptr, out_ptr, out.numel());
      }
      break;
    }
    case at::ScalarType::BFloat16: {
      auto* in_ptr = reinterpret_cast<nv_bfloat16*>(src);
      auto* out_ptr = reinterpret_cast<nv_bfloat16*>(out.data_ptr());
      if (use_fully_connected) {
        fa->allreduce<nv_bfloat16>(stream, in_ptr, out_ptr, out.numel());
      } else {
        fa->allreduce_pcie<nv_bfloat16>(stream, in_ptr, out_ptr, out.numel());
      }
      break;
    }
    default:
      throw std::runtime_error(
          "custom allreduce only supports float32, float16 and bfloat16");
  }
}

// Deletes the vllm::CustomAllreduce object that the opaque handle _fa refers
// to.
void dispose(fptr_t _fa) {
  auto* instance = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  delete instance;
}

276
// Returns sizeof(vllm::Signal) in bytes.
int64_t meta_size() {
  return static_cast<int64_t>(sizeof(vllm::Signal));
}
277

278
// Converts one opaque pointer per rank back to void* and registers the set
// with the CustomAllreduce instance behind _fa. The number of pointers must
// equal the instance's world size.
void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_);

  void* ipc_ptrs[16];
  size_t slot = 0;
  for (const fptr_t raw : fake_ipc_ptrs) {
    ipc_ptrs[slot++] = reinterpret_cast<void*>(raw);
  }
  fa->register_buffer(ipc_ptrs);
}

288
289
290
// Returns the graph-buffer IPC handle and offsets reported by the
// CustomAllreduce instance behind _fa. Each element of the handle is widened
// to its own int64_t (vector<int64_t> stands in for byte data for python
// binding compatibility).
std::tuple<std::vector<int64_t>, std::vector<int64_t>>
get_graph_buffer_ipc_meta(fptr_t _fa) {
  auto* allreduce = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  auto [raw_handle, buffer_offsets] = allreduce->get_graph_buffer_ipc_meta();

  std::vector<int64_t> handle_as_ints;
  handle_as_ints.assign(raw_handle.begin(), raw_handle.end());
  return std::make_tuple(handle_as_ints, buffer_offsets);
}

297
298
299
// Registers graph buffers with the CustomAllreduce instance behind _fa.
//
// handles: one vector per entry, where each int64_t element carries a single
//          byte of the handle (vector<int64_t> stands in for byte data for
//          python binding compatibility). Each vector is narrowed back into a
//          std::string before being passed on.
// offsets: forwarded unchanged.
void register_graph_buffers(fptr_t _fa,
                            const std::vector<std::vector<int64_t>>& handles,
                            const std::vector<std::vector<int64_t>>& offsets) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);

  std::vector<std::string> bytes;
  bytes.reserve(handles.size());
  for (const auto& handle : handles) {
    bytes.emplace_back(handle.begin(), handle.end());
  }
  fa->register_graph_buffers(bytes, offsets);
}
zhuwenwen's avatar
zhuwenwen committed
310
311

std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
312
313
314
315
316
317
318
319
320
    int64_t size) {
  auto device_index = c10::cuda::current_device();
  at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
  void* buffer;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  auto stream = c10::cuda::getCurrentCUDAStream().stream();
  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));

  // Allocate buffer
zhuwenwen's avatar
zhuwenwen committed
321
#if defined(USE_ROCM)
322
323
324
  // data buffers need to be "uncached" for signal on MI200
  AT_CUDA_CHECK(
      hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
zhuwenwen's avatar
zhuwenwen committed
325
#else
326
  AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
zhuwenwen's avatar
zhuwenwen committed
327
#endif
328
329
330
331
332
333
334
335
336
337
338
339
340
341
  AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
  AT_CUDA_CHECK(cudaStreamSynchronize(stream));
  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));

  // Create IPC memhandle for the allocated buffer.
  // Will use it in open_mem_handle.
  auto options =
      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
  auto handle =
      torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
  AT_CUDA_CHECK(
      cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));

  return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
zhuwenwen's avatar
zhuwenwen committed
342
}
343

zhuwenwen's avatar
zhuwenwen committed
344
345
346
347
348
349
350
351
352
353
/**
 * Opens a CUDA IPC memory handle and returns the mapped device pointer.
 *
 * @param mem_handle Contiguous CPU tensor whose first
 *                   sizeof(cudaIpcMemHandle_t) bytes hold the handle.
 * @return The pointer produced by cudaIpcOpenMemHandle, cast to fptr_t.
 * @throws c10::Error if mem_handle is not a sufficiently large contiguous CPU
 *         tensor, or if cudaIpcOpenMemHandle fails.
 */
fptr_t open_mem_handle(torch::Tensor& mem_handle) {
  // The handle bytes are dereferenced on the host below, so they must be
  // host-resident, contiguous, and large enough to hold a full handle.
  TORCH_CHECK(mem_handle.device().is_cpu(),
              "mem_handle must be a CPU tensor");
  TORCH_CHECK(mem_handle.is_contiguous(), "mem_handle must be contiguous");
  TORCH_CHECK(
      static_cast<size_t>(mem_handle.numel() * mem_handle.element_size()) >=
          sizeof(cudaIpcMemHandle_t),
      "mem_handle is too small to hold a cudaIpcMemHandle_t");

  void* ipc_ptr = nullptr;
  AT_CUDA_CHECK(cudaIpcOpenMemHandle(
      (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
      cudaIpcMemLazyEnablePeerAccess));
  return reinterpret_cast<fptr_t>(ipc_ptr);
}

// Releases the device allocation behind the opaque pointer `buffer` via
// cudaFree; a failing call is reported through AT_CUDA_CHECK.
void free_shared_buffer(fptr_t buffer) {
  void* device_ptr = reinterpret_cast<void*>(buffer);
  AT_CUDA_CHECK(cudaFree(device_ptr));
}