Merge tag 'v0.6.5' into v0.6.5-dev

4d3a2c28 · zhuwenwen · 92ec5d8e · 2d1b9baa · 4d3a2c28 · 4d3a2c28
Commit 4d3a2c28 authored Dec 30, 2024 by zhuwenwen
20 changed files
--- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
+++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
@@ -28,6 +28,7 @@
 #include "common/base.h"
 #include "core/scalar_type.hpp"
+#include "core/registration.h"
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
@@ -88,7 +89,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                  torch::Tensor& b_meta,
                                  torch::Tensor& b_scales,
                                  torch::Tensor& workspace,
-                                  vllm::ScalarTypeTorchPtr const& b_q_type,
+                                  vllm::ScalarTypeId const b_q_type_id,
                                  int64_t size_m, int64_t size_n,
                                  int64_t size_k) {
  TORCH_CHECK_NOT_IMPLEMENTED(
@@ -295,13 +296,9 @@ __global__ void Marlin_24(
  // We use a different scale layout for grouped and column-wise quantization as
  // we scale a `half2` tile in column-major layout in the former and in
  // row-major in the latter case.
-  if (group_blocks != -1) {
+  s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
-    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+            (threadIdx.x % 32) / 4;  // Note that in the original Marlin kernel
-              (threadIdx.x % 32) / 4;
+                                     // this is (threadIdx.x % 32) / 4
-  } else {
-    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
-              (threadIdx.x % 32) / 4;
-  }
  // Precompute which thread should not read memory in which iterations; this is
  // needed if there are more threads than required for a certain tilesize or
@@ -909,13 +906,16 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C,
      // than better compute utilization
      thread_k = 128;
      thread_m = 128;
-    } else if (prob_n <= 256) {
+    } else {
      thread_k = 64;
      thread_m = 256;
-    } else {
-      thread_k = 32;
-      thread_m = 512;
    }
+    // Also had
+    // if prob_n > 256
+    //   thread_k = 32;
+    //   thread_m = 512;
+    // but this is broken,
+    // TODO(Lucas, Alex M): figure out why
  }
  int thread_k_blocks = thread_k / 32;  // 2:4 version with m16n8k32 instruction
@@ -1028,13 +1028,14 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                  torch::Tensor& b_meta,
                                  torch::Tensor& b_scales,
                                  torch::Tensor& workspace,
-                                  vllm::ScalarTypeTorchPtr const& b_q_type,
+                                  vllm::ScalarTypeId const b_q_type_id,
                                  int64_t size_m, int64_t size_n,
                                  int64_t size_k) {
+  vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
  // Verify num_bits
-  TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
+  TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
-              "num_bits must be uint4b8 or uint8b128. Got = ", b_q_type->str());
+              "num_bits must be uint4b8 or uint8b128. Got = ", b_q_type.str());
-  int pack_factor = 32 / b_q_type->size_bits();
+  int pack_factor = 32 / b_q_type.size_bits();
  // Verify M
  TORCH_CHECK(size_m == a.size(0),
@@ -1077,6 +1078,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
  // Verify A device and strides
  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+  TORCH_CHECK(a.dtype() == torch::kFloat16,
+              "A is not float16, currently only float16 is supported");
  // Verify B device and strides
  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
@@ -1089,6 +1092,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
  // Verify scales device and strides
  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat16,
+              "b_scales is not float16, currently only float16 is supported");
  // Alloc C matrix
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
@@ -1129,8 +1134,12 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
  marlin_24::marlin_cuda_2_4(
      a.data_ptr(), b_q_weight.data_ptr(), b_meta.data_ptr(), c.data_ptr(),
      b_scales.data_ptr(), size_n, size_m, size_k, workspace.data_ptr(),
-      b_q_type->size_bits(), groupsize, dev,
+      b_q_type.size_bits(), groupsize, dev, at::cuda::getCurrentCUDAStream(dev),
-      at::cuda::getCurrentCUDAStream(dev), thread_k, thread_m, sms, max_par);
+      thread_k, thread_m, sms, max_par);
  return c;
 }
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {  // impl registrations for this extension under the CUDA key
+  m.impl("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);  // bind the op name to the host wrapper defined in this file
+}
--- a/csrc/quantization/vectorization.cuh
+++ b/csrc/quantization/vectorization.cuh
+#pragma once
+/**
+ * __device__ datatypes vectorized by 4
+ */
+// Include both AMD and NVIDIA fp8 types to avoid circular import
+// TODO(luka/varun) use FP8_TYPE instead after refactoring
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e4m3fn.h>
+namespace vllm {
+// Vectorization containers
+// Container of four `scalar_t` lanes (x, y, z, w), declared with 8-byte
+// alignment via __align__(8).
+template <typename scalar_t>
+struct __align__(8) vec4_t {
+  scalar_t x, y, z, w;
+};
+// Container of four quantized lanes (x, y, z, w), declared with 4-byte
+// alignment via __align__(4). Instantiation is rejected at compile time
+// unless quant_type_t is int8_t, c10::Float8_e4m3fn or c10::Float8_e4m3fnuz.
+template <typename quant_type_t>
+struct __align__(4) q8x4_t {
+  static_assert(std::is_same<quant_type_t, int8_t>::value ||
+                std::is_same<quant_type_t, c10::Float8_e4m3fn>::value ||
+                std::is_same<quant_type_t, c10::Float8_e4m3fnuz>::value);
+  quant_type_t x, y, z, w;
+};
+}  // namespace vllm
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -18,6 +18,9 @@
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
+  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
+  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
  // Attention ops
  // Compute the attention between an input query and the cached
  // keys/values using PagedAttention.
@@ -225,6 +228,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("gelu_tanh_and_mul_opt(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_tanh_and_mul_opt", torch::kCUDA, &gelu_tanh_and_mul);
+  // FATReLU implementation.
+  ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()");
+  ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul);
  // GELU implementation used in GPT-2.
  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_new", torch::kCUDA, &gelu_new);
@@ -259,7 +266,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
-      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
+      "rms_norm(Tensor! result, Tensor input, Tensor weight, float epsilon) -> "
      "()");
  ops.impl("rms_norm", torch::kCUDA, &rms_norm);
@@ -281,6 +288,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm_opt", torch::kCUDA, &fused_add_rms_norm_opt);
+  // Layernorm-quant
+  // Apply Root Mean Square (RMS) Normalization to the input tensor.
+//   ops.def(
+//       "rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, "
+//       "Tensor scale, float epsilon) -> "
+//       "()");
+//   ops.impl("rms_norm_static_fp8_quant", torch::kCUDA,
+//            &rms_norm_static_fp8_quant);
+  // In-place fused Add and RMS Normalization.
+//   ops.def(
+//       "fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, "
+//       "Tensor! residual, Tensor weight, "
+//       "Tensor scale, float epsilon) -> ()");
+//   ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA,
+//            &fused_add_rms_norm_static_fp8_quant);
+  // Fused Layernorm + Quant kernels
+  ops.def(
+      "rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, "
+      "Tensor weight, Tensor! scale, float epsilon, "
+      "Tensor? scale_ub, Tensor!? residual) -> ()");
+  ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
+           &rms_norm_dynamic_per_token_quant);
  // Rotary embedding
  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
  ops.def(
@@ -330,13 +362,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantized GEMM for AWQ.
  ops.def(
      "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
-      "Tensor _zeros, int split_k_iters) -> Tensor");
+      "Tensor _zeros, SymInt split_k_iters) -> Tensor");
  ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
  // Dequantization for AWQ.
  ops.def(
      "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
-      "Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
+      "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor");
  ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
  // Note about marlin kernel 'workspace' arguments:
@@ -356,31 +388,50 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
  ops.def(
      "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
-      "Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
+      "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> "
-  ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
+      "Tensor");
+  // conditionally compiled so impl in source file
  // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
  ops.def(
      "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
      "Tensor b_scales, Tensor workspace, "
-      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int b_q_type, "
-      "int size_m, int size_n, int size_k) -> Tensor");
+      "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor");
-  ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
+  //  conditionally compiled so impl in source file
  // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
-  ops.def("machete_supported_schedules", &machete::supported_schedules);
  ops.def(
-      "machete_gemm(Tensor A, Tensor B,"
+      "machete_supported_schedules("
-      "             __torch__.torch.classes._core_C.ScalarType btype,"
+      "   ScalarType a_type,"
-      "             Tensor? scales, Tensor? zeros, int? group_size,"
+      "   int b_type,"
-      "             Tensor? C, float? alpha, float? beta, str? schedule)"
+      "   ScalarType? maybe_group_scales_type,"
-      "-> Tensor");
+      "   ScalarType? maybe_group_zeros_type,"
-  ops.impl("machete_gemm", torch::kCUDA, &machete::gemm);
+      "   ScalarType? maybe_channel_scales_type,"
-  ops.def(
+      "   ScalarType? maybe_token_scales_type,"
-      "machete_prepack_B(Tensor B,"
+      "   ScalarType? maybe_out_type"
-      "                  __torch__.torch.classes._core_C.ScalarType btype)"
+      ") -> str[]");
-      "-> Tensor");
+  ops.def(
-  ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
+      "machete_mm("
+      "   Tensor A,"
+      "   Tensor B,"
+      "   int b_type,"
+      "   ScalarType? out_type,"
+      "   Tensor? group_scales,"
+      "   Tensor? group_zeros,"
+      "   int?    group_size,"
+      "   Tensor? channel_scales,"
+      "   Tensor? token_scales,"
+      "   str?    schedule"
+      ") -> Tensor");
+  ops.def(
+      "machete_prepack_B("
+      "   Tensor B,"
+      "   ScalarType a_type,"
+      "   int b_type,"
+      "   ScalarType? group_scales_type"
+      ") -> Tensor");
+  // conditionally compiled so impl registration is in source file
  ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
  ops.impl("permute_cols", torch::kCUDA, &permute_cols);
@@ -389,53 +440,54 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def(
      "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
      "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
-      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int b_q_type, "
-      "int size_m, int size_n, int size_k, bool is_k_full, "
+      "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
-      "bool has_zp, bool use_fp32_reduce) -> Tensor");
+      "bool has_zp, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
-  ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
+  // conditionally compiled so impl registration is in source file
  // gptq_marlin repack from GPTQ.
  ops.def(
      "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
      "SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
-  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
+  // conditionally compiled so impl registrations are in source file
-  ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
  // awq_marlin repack from AWQ.
  ops.def(
      "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
      "SymInt size_n, int num_bits) -> Tensor");
-  ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
+  // conditionally compiled so impl registrations are in source file
-  ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
+#endif
  // Dequantization for GGML.
-  ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
+  ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor");
  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
  // mmvq kernel for GGML.
  ops.def(
-      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
+      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) "
      "-> Tensor");
  ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
  // mmq kernel for GGML.
-  ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
+  ops.def(
+      "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor");
  ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
+#ifndef USE_ROCM
  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
  ops.def(
      "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
-      "Tensor! workspace, int num_bits, int size_m, int size_n, "
+      "Tensor! workspace, int num_bits, SymInt size_m, SymInt size_n, "
-      "int size_k) -> Tensor");
+      "SymInt size_k) -> Tensor");
-  ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
+  // conditionally compiled so impl registration is in source file
  // marlin_qqq_gemm for QQQ.
  ops.def(
      "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
      "Tensor s_tok, Tensor s_ch, Tensor s_group, "
-      "Tensor! workspace, int size_m, int size_n, "
+      "Tensor! workspace, SymInt size_m, SymInt size_n, "
-      "int size_k) -> Tensor");
+      "SymInt size_k) -> Tensor");
-  ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
+  // conditionally compiled so impl registration is in source file
  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
  // quantization, as well as bias
@@ -463,27 +515,35 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def(
      "selective_scan_fwd(Tensor! u, Tensor! delta,"
      "Tensor! A, Tensor! B, Tensor! C,"
-      "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
+      "Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
      "bool delta_softplus,"
-      "Tensor? index_, Tensor!? x) -> Tensor[]");
+      "Tensor? query_start_loc,"
+      "Tensor? cache_indices,"
+      "Tensor? has_initial_state,"
+      "Tensor! ssm_states,"
+      "int pad_slot_id) -> ()");
  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
  ops.def(
      "causal_conv1d_update(Tensor! x,"
      "Tensor! conv_state,"
      "Tensor! weight,"
-      "Tensor? bias,"
+      "Tensor? bias_,"
      "bool silu_activation,"
-      "Tensor? conv_state_indices) -> Tensor");
+      "Tensor? cache_seqlens_,"
+      "Tensor? conv_state_indices,"
+      "int pad_slot_id) -> ()");
  ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
  ops.def(
      "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
      "Tensor? bias_,"
-      "Tensor? seq_idx_,"
+      "Tensor!? conv_states,"
-      "Tensor? initial_states_,"
+      "Tensor? query_start_loc,"
-      "Tensor!? final_states_out_,"
+      "Tensor? cache_indices,"
-      "bool silu_activation) -> Tensor");
+      "Tensor? has_initial_state,"
+      "bool silu_activation,"
+      "int pad_slot_id) -> ()");
  ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
 #endif
@@ -502,42 +562,34 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute FP8 quantized tensor for given scaling factor.
 //   ops.def(
-//       "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
+//       "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "
+//       "()");
 //   ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
+//   // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
-  // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
 //   ops.def(
-//       "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
+//       "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
+//       "-> "
 //       "()");
 //   ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
-  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
+//   // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
 //   ops.def(
-//       "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
+//       "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
 //       "Tensor! scale, Tensor? scale_ub) -> "
 //       "()");
 //   ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
 //            &dynamic_per_token_scaled_fp8_quant);
-  // Aligning the number of tokens to be processed by each expert such
-  // that it is divisible by the block size.
-  ops.def(
-      "moe_align_block_size(Tensor topk_ids, int num_experts,"
-      "                     int block_size, Tensor! sorted_token_ids,"
-      "                     Tensor! experts_ids,"
-      "                     Tensor! num_tokens_post_pad) -> ()");
-  ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
-      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
+      "static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale,"
      "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant);
  // Compute int8 quantized tensor and scaling factor
  ops.def(
-      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
+      "dynamic_scaled_int8_quant(Tensor! result, Tensor input, Tensor! scale, "
      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
           &dynamic_scaled_int8_quant);
@@ -617,27 +669,18 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  // Custom all-reduce kernels
  custom_ar.def(
-      "init_custom_ar(Tensor meta, Tensor rank_data, "
+      "init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
-      "str[] handles, int[] offsets, int rank, "
+      "int rank, bool full_nvlink) -> int");
-      "bool full_nvlink) -> int");
  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
-  custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
-  custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);
  custom_ar.def(
-      "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> "
+      "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
-      "()");
+      "int reg_buffer_sz_bytes) -> ()");
-  custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
+  custom_ar.impl("all_reduce", torch::kCUDA, &all_reduce);
  custom_ar.def("dispose", &dispose);
  custom_ar.def("meta_size", &meta_size);
-  custom_ar.def(
+  custom_ar.def("register_buffer", &register_buffer);
-      "register_buffer(int fa, Tensor t, str[] handles, "
-      "int[] offsets) -> ()");
-  custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
  custom_ar.def("register_graph_buffers", &register_graph_buffers);
 }

--- a/csrc/type_convert.cuh
+++ b/csrc/type_convert.cuh
+#pragma once
+#include <torch/all.h>
+#ifndef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+#else
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+using __nv_bfloat16 = __hip_bfloat16;
+using __nv_bfloat162 = __hip_bfloat162;
+#endif
+namespace vllm {
+/* Converter structs for the conversion from torch types to HIP/CUDA types,
+   and the associated type conversions within HIP/CUDA. These helpers need
+   to be implemented for now because the relevant type conversion
+   operators/constructors are not consistently implemented by HIP/CUDA, so
+   a generic conversion via type casts cannot be implemented.
+   Each struct should have the member static constexpr bool `exists`:
+   If false, the optimized kernel is not used for the corresponding torch type.
+   If true, the struct should be fully defined as shown in the examples below.
+ */
+// Primary template, selected for any torch type without a dedicated
+// specialization: `exists` is false, so no conversion helpers are offered.
+template <typename torch_type>
+struct _typeConvert {
+  static constexpr bool exists{false};
+};
+#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
+// CUDA < 12.0 runs into issues with packed type conversion
+// c10::Half specialization: maps onto the __half / __half2 device types and
+// supplies the four convert() overloads (scalar/packed, to/from float).
+template <>
+struct _typeConvert<c10::Half> {
+  static constexpr bool exists{true};
+  typedef __half hip_type;
+  typedef __half2 packed_hip_type;
+  // scalar half -> float
+  __device__ static inline float convert(hip_type value) {
+    return __half2float(value);
+  }
+  // packed half pair -> float2
+  __device__ static inline float2 convert(packed_hip_type value_pair) {
+    return __half22float2(value_pair);
+  }
+  // float -> scalar half (round-to-nearest intrinsic)
+  __device__ static inline hip_type convert(float value) {
+    return __float2half_rn(value);
+  }
+  // float2 -> packed half pair (round-to-nearest intrinsic)
+  __device__ static inline packed_hip_type convert(float2 value_pair) {
+    return __float22half2_rn(value_pair);
+  }
+};
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+// CUDA_ARCH < 800 does not have BF16 support
+// TODO: Add in ROCm support once public headers handle bf16 maturely
+// c10::BFloat16 specialization: maps onto the __nv_bfloat16 / __nv_bfloat162
+// device types and supplies the four convert() overloads (scalar/packed,
+// to/from float).
+template <>
+struct _typeConvert<c10::BFloat16> {
+  static constexpr bool exists{true};
+  typedef __nv_bfloat16 hip_type;
+  typedef __nv_bfloat162 packed_hip_type;
+  // scalar bf16 -> float
+  __device__ static inline float convert(hip_type value) {
+    return __bfloat162float(value);
+  }
+  // packed bf16 pair -> float2
+  __device__ static inline float2 convert(packed_hip_type value_pair) {
+    return __bfloat1622float2(value_pair);
+  }
+  // float -> scalar bf16
+  __device__ static inline hip_type convert(float value) {
+    return __float2bfloat16(value);
+  }
+  // float2 -> packed bf16 pair
+  __device__ static inline packed_hip_type convert(float2 value_pair) {
+    return __float22bfloat162_rn(value_pair);
+  }
+};
+  #endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#endif    // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
+          // 12000))
+/* Vector POD struct to generate vectorized and packed FP16/BF16 ops
+   for appropriate specializations of fused_add_rms_norm_kernel.
+   Only functions that are necessary in that kernel are implemented.
+   Alignment to 16 bytes is required to use 128-bit global memory ops.
+
+   Template parameters:
+     scalar_t - torch element type; _typeConvert<scalar_t> must provide
+                hip_type, packed_hip_type and the convert() overloads.
+     width    - number of elements stored in `data`; must be a positive
+                power of 2 (enforced by the static_assert in the struct).
+
+   Each member function handles elements two at a time through the packed
+   type when width is even, and one at a time otherwise.
+ */
+template <typename scalar_t, int width>
+struct alignas(16) _f16Vec {
+  /* Not theoretically necessary that width is a power of 2 but should
+     almost always be the case for optimization purposes */
+  static_assert(width > 0 && (width & (width - 1)) == 0,
+                "Width is not a positive power of 2!");
+  using Converter = _typeConvert<scalar_t>;
+  using T1 = typename Converter::hip_type;  // scalar device type
+  using T2 = typename Converter::packed_hip_type;  // packed two-element device type
+  T1 data[width];  // element storage
+  __device__ _f16Vec& operator+=(const _f16Vec<scalar_t, width>& other) {  // in-place element-wise add; returns *this
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        T2 temp{data[i], data[i + 1]};  // pack two neighbours
+        temp += T2{other.data[i], other.data[i + 1]};
+        data[i] = temp.x;  // unpack the result back into storage
+        data[i + 1] = temp.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) data[i] += other.data[i];
+    }
+    return *this;
+  }
+  __device__ _f16Vec& operator*=(const _f16Vec<scalar_t, width>& other) {  // in-place element-wise multiply; returns *this
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        T2 temp{data[i], data[i + 1]};  // pack two neighbours
+        temp *= T2{other.data[i], other.data[i + 1]};
+        data[i] = temp.x;  // unpack the result back into storage
+        data[i + 1] = temp.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) data[i] *= other.data[i];
+    }
+    return *this;
+  }
+  __device__ _f16Vec& operator*=(const float scale) {  // in-place multiply of every element by a float; returns *this
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        float2 temp_f = Converter::convert(T2{data[i], data[i + 1]});  // widen the pair to float2
+        temp_f.x *= scale;  // the multiplication itself is done in float
+        temp_f.y *= scale;
+        T2 temp = Converter::convert(temp_f);  // convert back to the packed type
+        data[i] = temp.x;
+        data[i + 1] = temp.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) {
+        float temp = Converter::convert(data[i]) * scale;  // widen, then scale in float
+        data[i] = Converter::convert(temp);
+      }
+    }
+    return *this;
+  }
+  __device__ float sum_squares() const {  // returns the sum of data[i]^2, accumulated in float
+    float result = 0.0f;
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        float2 z = Converter::convert(T2{data[i], data[i + 1]});  // widen the pair to float2
+        result += z.x * z.x + z.y * z.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) {
+        float x = Converter::convert(data[i]);  // widen one element to float
+        result += x * x;
+      }
+    }
+    return result;
+  }
+};
+}  // namespace vllm
\ No newline at end of file
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -4,6 +4,7 @@ sphinx-copybutton==0.5.2
 myst-parser==2.0.0
 sphinx-argparse==0.4.0
 msgspec
+cloudpickle
 # packages to install to build the documentation
 pydantic >= 2.8
@@ -11,5 +12,10 @@ pydantic >= 2.8
 torch
 py-cpuinfo
 transformers
-mistral_common >= 1.3.4
+mistral_common >= 1.5.0
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+aiohttp
\ No newline at end of file
+starlette
+openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+requests
--- a/docs/source/_static/custom.js
+++ b/docs/source/_static/custom.js
@@ -9,6 +9,8 @@ document.addEventListener("DOMContentLoaded", function () {
    script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget.
    script.setAttribute("runllm-name", "vLLM");
    script.setAttribute("runllm-position", "BOTTOM_RIGHT");
+    script.setAttribute("runllm-position-y", "20%");
+    script.setAttribute("runllm-position-x", "3%");
    script.setAttribute("runllm-assistant-id", "207");
    script.async = true;

--- a/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png
+++ b/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png
--- a/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png
+++ b/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png
--- a/docs/source/assets/design/hierarchy.png
+++ b/docs/source/assets/design/hierarchy.png
--- a/docs/source/assets/usage/disagg_prefill/abstraction.jpg
+++ b/docs/source/assets/usage/disagg_prefill/abstraction.jpg
--- a/docs/source/assets/usage/disagg_prefill/overview.jpg
+++ b/docs/source/assets/usage/disagg_prefill/overview.jpg
--- a/docs/source/automatic_prefix_caching/details.md
+++ b/docs/source/automatic_prefix_caching/details.md
@@ -25,7 +25,7 @@ With this mapping, we can add another indirection in vLLM’s KV cache managemen
 This design achieves automatic prefix caching without the need to maintain a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed individually, which enables us to manage the KV cache like an ordinary cache in an operating system.
-# Generalized Caching Policy
+## Generalized Caching Policy
 Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full.

--- a/docs/source/community/meetups.rst
+++ b/docs/source/community/meetups.rst
@@ -5,6 +5,7 @@ vLLM Meetups
 We host regular meetups in the San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+- `The seventh vLLM meetup <https://lu.ma/h0qvrajz>`__, with Snowflake, November 14th 2024. `[Slides] <https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing>`__
 - `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
 - `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
 - `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__

--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -15,6 +15,7 @@ vLLM is a community project. Our compute resources for development and testing a
 - Dropbox
 - Google Cloud
 - Lambda Lab
+- Nebius
 - NVIDIA
 - Replicate
 - Roblox

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -10,11 +10,13 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
+import inspect
 import logging
 import os
 import sys
 from typing import List
+import requests
 from sphinx.ext import autodoc
 logger = logging.getLogger(__name__)
@@ -34,6 +36,7 @@ author = 'the vLLM Team'
 extensions = [
    "sphinx.ext.napoleon",
    "sphinx.ext.viewcode",
+    "sphinx.ext.linkcode",
    "sphinx.ext.intersphinx",
    "sphinx_copybutton",
    "sphinx.ext.autodoc",
@@ -94,9 +97,71 @@ def setup(app):
    generate_examples()
+# Fork and branch of the pull request being built. Filled in by the first
+# successful lookup so later calls do not query the GitHub API again.
+_cached_base: str = ""
+_cached_branch: str = ""
+def get_repo_base_and_branch(pr_number):
+    """Look up the head repository and branch of a vLLM pull request.
+
+    Queries the GitHub REST API for ``vllm-project/vllm`` pull request
+    ``pr_number`` and caches the first successful answer in module globals.
+
+    Args:
+        pr_number: Pull request number, interpolated into the API URL.
+
+    Returns:
+        A ``(full_repo_name, branch)`` tuple, or ``(None, None)`` when the
+        lookup fails for any reason (network error, non-200 status, or an
+        unexpected response body). Failures are logged and never raised, so
+        a docs build is not aborted by an unreachable API.
+    """
+    global _cached_base, _cached_branch
+    if _cached_base and _cached_branch:
+        return _cached_base, _cached_branch
+    url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
+    try:
+        # Requests never times out by default; bound the wait so an
+        # unreachable API cannot stall the build indefinitely.
+        response = requests.get(url, timeout=10)
+    except requests.RequestException as exc:
+        logger.error("Failed to fetch PR details: %s", exc)
+        return None, None
+    if response.status_code != 200:
+        logger.error("Failed to fetch PR details: %s", response)
+        return None, None
+    try:
+        head = response.json()['head']
+        base, branch = head['repo']['full_name'], head['ref']
+    except (KeyError, TypeError, ValueError) as exc:
+        # ``head.repo`` is null once the source fork has been deleted, and the
+        # body may not be valid JSON at all.
+        logger.error("Failed to fetch PR details: %s", exc)
+        return None, None
+    # Only commit to the cache once both values are known to be valid.
+    _cached_base, _cached_branch = base, branch
+    return _cached_base, _cached_branch
+def linkcode_resolve(domain, info):
+    """Return the GitHub URL of the source code for a documented object.
+
+    This is the hook called by ``sphinx.ext.linkcode``.
+
+    Args:
+        domain: Sphinx domain of the object; only ``'py'`` is handled.
+        info: Mapping with the ``'module'`` and ``'fullname'`` of the object.
+
+    Returns:
+        A URL pointing at the file and line of the object, or ``None`` when
+        no link should be generated (non-Python domain, missing module name,
+        or a module that has not been imported).
+    """
+    if domain != 'py':
+        return None
+    if not info['module']:
+        return None
+    module = info['module']
+    obj = sys.modules.get(module)
+    if obj is None:
+        # The module was never imported, so there is nothing to inspect.
+        return None
+    # Start from the path implied by the module name; it is kept whenever
+    # inspection below cannot determine the real source file.
+    filename: str = f"{module.replace('.', '/')}.py"
+    lineno: int = 0
+    # get as specific as we can
+    try:
+        for part in info['fullname'].split('.'):
+            obj = getattr(obj, part)
+            if not (inspect.isclass(obj) or inspect.isfunction(obj)
+                    or inspect.ismethod(obj)):
+                obj = obj.__class__  # Get the class of the instance
+            lineno = inspect.getsourcelines(obj)[1]
+            source_file = inspect.getsourcefile(obj)
+            if source_file:
+                # Keep only the part of the path after the first "vllm/".
+                filename = source_file.split("vllm/", 1)[1]
+    except Exception:
+        # For some things, like a class member, won't work, so
+        # we'll use the line number of the parent (the class)
+        pass
+    if filename.startswith("checkouts/"):
+        # a PR build on readthedocs
+        pr_number = filename.split("/")[1]
+        filename = filename.split("/", 2)[2]
+        base, branch = get_repo_base_and_branch(pr_number)
+        if base and branch:
+            return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
+    # Otherwise, link to the source file on the main branch
+    return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
 # Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
-    "aiohttp",
    "compressed_tensors",
    "cpuinfo",
    "cv2",
@@ -113,10 +178,12 @@ autodoc_mock_imports = [
    "tensorizer",
    "pynvml",
    "outlines",
+    "xgrammar",
    "librosa",
    "soundfile",
    "gguf",
    "lark",
+    "decord",
 ]
 for mock_target in autodoc_mock_imports:
@@ -143,6 +210,7 @@ intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "typing_extensions":
    ("https://typing-extensions.readthedocs.io/en/latest", None),
+    "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
    "numpy": ("https://numpy.org/doc/stable", None),
    "torch": ("https://pytorch.org/docs/stable", None),

--- a/docs/source/dev/dockerfile/dockerfile.rst
+++ b/docs/source/dev/dockerfile/dockerfile.rst
--- a/docs/source/contributing/overview.rst
+++ b/docs/source/contributing/overview.rst
+Contributing to vLLM
+=====================
+Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
+- Identify and report any issues or bugs.
+- Request or add support for a new model.
+- Suggest or implement new features.
+- Improve documentation or contribute a how-to guide.
+We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
+License
+-------
+See `LICENSE <https://github.com/vllm-project/vllm/tree/main/LICENSE>`_.
+Developing
+----------
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source <https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source>`_ documentation for details.
+Testing
+-------
+.. code-block:: bash
+    pip install -r requirements-dev.txt
+    # linting and formatting
+    bash format.sh
+    # Static type checking
+    mypy
+    # Unit tests
+    pytest tests/
+.. note:: Currently, the repository does not pass the ``mypy`` tests.
+Contribution Guidelines
+=======================
+Issues
+------
+If you encounter a bug or have a feature request, please `search existing issues <https://github.com/vllm-project/vllm/issues?q=is%3Aissue>`_ first to see if it has already been reported. If not, please `file a new issue <https://github.com/vllm-project/vllm/issues/new/choose>`_, providing as much relevant information as possible.
+.. important::
+   If you discover a security vulnerability, please follow the instructions `here <https://github.com/vllm-project/vllm/tree/main/SECURITY.md#reporting-a-vulnerability>`_.
+Pull Requests & Code Reviews
+----------------------------
+Thank you for your contribution to vLLM! Before submitting the pull request,
+please ensure the PR meets the following criteria. This helps vLLM maintain the
+code quality and improve the efficiency of the review process.
+DCO and Signed-off-by
+^^^^^^^^^^^^^^^^^^^^^
+When contributing changes to this project, you must agree to the `DCO <https://github.com/vllm-project/vllm/tree/main/DCO>`_.
+Commits must include a ``Signed-off-by:`` header which certifies agreement with
+the terms of the `DCO <https://github.com/vllm-project/vllm/tree/main/DCO>`_.
+Using ``-s`` with ``git commit`` will automatically add this header.
+PR Title and Classification
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Only specific types of PRs will be reviewed. The PR title must be prefixed
+appropriately to indicate the type of change. Please use one of the following:
+- ``[Bugfix]`` for bug fixes.
+- ``[CI/Build]`` for build or continuous integration improvements.
+- ``[Doc]`` for documentation fixes and improvements.
+- ``[Model]`` for adding a new model or improving an existing model. Model name
+  should appear in the title.
+- ``[Frontend]`` for changes on the vLLM frontend (e.g., OpenAI API server,
+  ``LLM`` class, etc.)
+- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels.
+- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``,
+  ``AsyncLLMEngine``, ``Scheduler``, etc.)
+- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should
+  appear in the prefix (e.g., ``[Hardware][AMD]``).
+- ``[Misc]`` for PRs that do not fit the above categories. Please use this
+  sparingly.
+.. note::
+   If the PR spans more than one category, please include all relevant prefixes.
+Code Quality
+^^^^^^^^^^^^
+The PR needs to meet the following code quality standards:
+- We adhere to `Google Python style guide
+  <https://google.github.io/styleguide/pyguide.html>`_ and `Google C++ style guide
+  <https://google.github.io/styleguide/cppguide.html>`_.
+- Pass all linter checks. Please use `format.sh
+  <https://github.com/vllm-project/vllm/blob/main/format.sh>`_ to format your
+  code.
+- The code needs to be well-documented to ensure future contributors can easily
+  understand the code.
+- Include sufficient tests to ensure the project stays correct and robust. This
+  includes both unit tests and integration tests.
+- Please add documentation to ``docs/source/`` if the PR modifies the
+  user-facing behaviors of vLLM. It helps vLLM users understand and utilize the
+  new features or changes.
+Adding or Changing Kernels
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
+- Make sure custom ops are registered following PyTorch guidelines:
+  `Custom C++ and CUDA Operators <https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial>`_
+  and `The Custom Operators Manual <https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU>`_.
+- Custom operations that return ``Tensors`` require meta-functions.
+  Meta-functions should be implemented and registered in Python so that dynamic
+  dims can be handled automatically. See above documents for a description of
+  meta-functions.
+- Use `torch.library.opcheck() <https://pytorch.org/docs/stable/library.html#torch.library.opcheck>`_
+  to test the function registration and meta-function for any registered ops.
+  See ``tests/kernels`` for examples.
+- When changing the C++ signature of an existing op, the schema must be updated
+  to reflect the changes.
+- If a new custom type is needed, see the following document:
+  `Custom Class Support in PT2 <https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA>`_.
+Notes for Large Changes
+^^^^^^^^^^^^^^^^^^^^^^^
+Please keep the changes as concise as possible. For major architectural changes
+(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue
+(RFC) discussing the technical design and justification. Otherwise, we will tag
+it with ``rfc-required`` and might not review the PR.
+What to Expect for the Reviews
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The goal of the vLLM team is to be a *transparent reviewing machine*. We would
+like to make the review process transparent and efficient and make sure no
+contributor feels confused or frustrated. However, the vLLM team is small, so we
+need to prioritize some PRs over others. Here is what you can expect from the
+review process:
+- After the PR is submitted, the PR will be assigned to a reviewer. Every
+  reviewer will pick up the PRs based on their expertise and availability.
+- After the PR is assigned, the reviewer will provide status updates every 2-3
+  days. If the PR is not reviewed within 7 days, please feel free to ping the
+  reviewer or the vLLM team.
+- After the review, the reviewer will put an ``action-required`` label on the PR
+  if there are changes required. The contributor should address the comments and
+  ping the reviewer to re-review the PR.
+- Please respond to all comments within a reasonable time frame. If a comment
+  isn't clear or you disagree with a suggestion, feel free to ask for
+  clarification or discuss the suggestion.
+Thank You
+---------
+Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
+All of your contributions help make vLLM a great tool and community for everyone!
--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
-Profiling vLLM 
+==============
-=================================
+Profiling vLLM
+==============
 We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/``

--- a/docs/source/design/arch_overview.rst
+++ b/docs/source/design/arch_overview.rst
+.. _arch_overview:
+Architecture Overview
+======================
+This document provides an overview of the vLLM architecture.
+.. contents:: Table of Contents
+    :local:
+    :depth: 2
+Entrypoints
+-----------
+vLLM provides a number of entrypoints for interacting with the system. The
+following diagram shows the relationship between them.
+.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png
+    :alt: Entrypoints Diagram
+LLM Class
+^^^^^^^^^
+The LLM class provides the primary Python interface for doing offline inference,
+which is interacting with a model without using a separate model inference
+server.
+Here is a sample of `LLM` class usage:
+.. code-block:: python
+    from vllm import LLM, SamplingParams
+    # Define a list of input prompts
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The largest ocean is",
+    ]
+    # Define sampling parameters
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    # Initialize the LLM engine with the OPT-125M model
+    llm = LLM(model="facebook/opt-125m")
+    # Generate outputs for the input prompts
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the generated outputs
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+More API details can be found in the :doc:`Offline Inference
+</dev/offline_inference/offline_index>` section of the API docs.
+The code for the `LLM` class can be found in `vllm/entrypoints/llm.py
+<https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py>`_.
+OpenAI-compatible API server
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The second primary interface to vLLM is via its OpenAI-compatible API server.
+This server can be started using the `vllm serve` command.
+.. code-block:: bash
+    vllm serve <model>
+The code for the `vllm` CLI can be found in `vllm/scripts.py
+<https://github.com/vllm-project/vllm/blob/main/vllm/scripts.py>`_.
+Sometimes you may see the API server entrypoint used directly instead of via the
+`vllm` CLI command. For example:
+.. code-block:: bash
+    python -m vllm.entrypoints.openai.api_server --model <model>
+That code can be found in `vllm/entrypoints/openai/api_server.py
+<https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py>`_.
+More details on the API server can be found in the :doc:`OpenAI Compatible
+Server </serving/openai_compatible_server>` document.
+LLM Engine
+----------
+The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
+the vLLM system, handling model inference and asynchronous request processing.
+.. image:: /assets/design/arch_overview/llm_engine.excalidraw.png
+    :alt: LLMEngine Diagram
+LLMEngine
+^^^^^^^^^
+The `LLMEngine` class is the core component of the vLLM engine. It is
+responsible for receiving requests from clients and generating outputs from the
+model. The `LLMEngine` includes input processing, model execution (possibly
+distributed across multiple hosts and/or GPUs), scheduling, and output
+processing.
+- **Input Processing**: Handles tokenization of input text using the specified
+  tokenizer.
+- **Scheduling**: Chooses which requests are processed in each step.
+- **Model Execution**: Manages the execution of the language model, including
+  distributed execution across multiple GPUs.
+- **Output Processing**: Processes the outputs generated by the model, decoding the
+  token IDs from a language model into human-readable text.
+The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_.
+.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py
+AsyncLLMEngine
+^^^^^^^^^^^^^^
+The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class.
+It uses `asyncio` to create a background loop that continuously processes
+incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
+can handle multiple concurrent requests and stream outputs to clients.
+The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
+API server that serves as a simpler example in
+`vllm/entrypoints/api_server.py`_.
+.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py
+The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_.
+.. _vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py
+Worker
+------
+A worker is a process that runs model inference. vLLM follows the common
+practice of using one process to control one accelerator device, such as a GPU.
+For example, if we use tensor parallelism of size 2 and pipeline parallelism of
+size 2, we will have 4 workers in total. Workers are identified by their
+``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while
+``local_rank`` is mainly used for assigning the accelerator device and accessing
+local resources such as the file system and shared memory.
+Model Runner
+------------
+Every worker has one model runner object, responsible for loading and running
+the model. Much of the model execution logic resides here, such as preparing
+input tensors and capturing cudagraphs.
+Model
+-----
+Every model runner object has one model object, which is the actual
+``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various
+configurations affect the class we ultimately get.
+Class Hierarchy
+---------------
+The following figure shows the class hierarchy of vLLM:
+    .. figure:: /assets/design/hierarchy.png
+        :alt: query
+        :width: 100%
+        :align: center
+There are several important design choices behind this class hierarchy:
+1. **Extensibility**: All classes in the hierarchy accept a configuration object
+containing all the necessary information. The `VllmConfig
+<https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036>`__
+class is the main configuration object that is passed around. The class
+hierarchy is quite deep, and every class needs to read the configuration it is
+interested in. By encapsulating all configurations in one object, we can easily
+pass the configuration object around and access the configuration we need.
+Suppose we want to add a new feature (this is often the case given how fast the
+field of LLM inference is evolving) that only touches the model runner. We will
+have to add a new configuration option in the `VllmConfig` class. Since we pass
+the whole config object around, we only need to add the configuration option to
+the `VllmConfig` class, and the model runner can access it directly. We don't
+need to change the constructor of the engine, worker, or model class to pass the
+new configuration option.
+2. **Uniformity**: The model runner needs a unified interface to create and
+initialize the model. vLLM supports more than 50 types of popular open-source
+models. Each model has its own initialization logic. If the constructor
+signature varies with models, the model runner does not know how to call the
+constructor accordingly, without complicated and error-prone inspection logic.
+By making the constructor of the model class uniform, the model runner can
+easily create and initialize the model without knowing the specific model type.
+This is also useful for composing models. Vision-language models often consist
+of a vision model and a language model. By making the constructor uniform, we
+can easily create a vision model and a language model and compose them into a
+vision-language model.
+.. note::
+    To support this change, all vLLM models' signatures have been updated to:
+    .. code-block:: python
+        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
+    .. code-block:: python
+        class MyOldModel(nn.Module):
+            def __init__(
+                self,
+                config,
+                cache_config: Optional[CacheConfig] = None,
+                quant_config: Optional[QuantizationConfig] = None,
+                lora_config: Optional[LoRAConfig] = None,
+                prefix: str = "",
+            ) -> None:
+                ...
+        from packaging.version import Version
+        from vllm import __version__
+        from vllm.config import VllmConfig
+        class MyNewModel(MyOldModel):
+            def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+                config = vllm_config.model_config.hf_config
+                cache_config = vllm_config.cache_config
+                quant_config = vllm_config.quant_config
+                lora_config = vllm_config.lora_config
+                super().__init__(config, cache_config, quant_config, lora_config, prefix)
+        # Compare parsed versions: a plain string comparison would rank
+        # "0.10.0" below "0.6.4".
+        if Version(__version__) >= Version("0.6.4"):
+            MyModel = MyNewModel
+        else:
+            MyModel = MyOldModel
+    This way, the model can work with both old and new versions of vLLM.
+3. **Sharding and Quantization at Initialization**: Certain features require
+changing the model weights. For example, tensor parallelism needs to shard the
+model weights, and quantization needs to quantize the model weights. There are
+two possible ways to implement this feature. One way is to change the model
+weights after the model is initialized. The other way is to change the model
+weights during the model initialization. vLLM chooses the latter. The first
+approach is not scalable to large models. Suppose we want to run a 405B model
+(with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should
+only load 50GB weights. If we change the model weights after the model is
+initialized, we need to load the full 810GB weights to every GPU and then shard
+the weights, leading to a huge memory overhead. Instead, if we shard the weights
+during the model initialization, every layer will only create a shard of the
+weights it needs, leading to a much smaller memory overhead. The same idea
+applies to quantization. Note that we also add an additional argument ``prefix``
+to the model's constructor so that the model can initialize itself differently
+based on the prefix. This is useful for non-uniform quantization, where
+different parts of the model are quantized differently. The ``prefix`` is
+usually an empty string for the top-level model and a string like ``"vision"``
+or ``"language"`` for the sub-models. In general, it matches the name of the
+module's state dict in the checkpoint file.
+One disadvantage of this design is that it is hard to write unit tests for
+individual components in vLLM because every component needs to be initialized by
+a complete config object. We solve this problem by providing a default
+initialization function that creates a default config object with all fields set
+to ``None``. If the component we want to test only cares about a few fields in
+the config object, we can create a default config object and set the fields we
+care about. This way, we can test the component in isolation. Note that many
+tests in vLLM are end-to-end tests that test the whole system, so this is not a
+big problem.
+In summary, the complete config object ``VllmConfig`` can be treated as an
+engine-level global state that is shared among all vLLM classes.
--- a/docs/source/design/huggingface_integration.rst
+++ b/docs/source/design/huggingface_integration.rst
+.. _huggingface_integration:
+Integration with HuggingFace
+===================================
+This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``.
+Let's say we want to serve the popular Qwen model by running ``vllm serve Qwen/Qwen2-7B``.
+1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182>`__ for the implementation. Within this process:
+   - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
+   - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website <https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome>`__ for more information on how the HuggingFace cache works.
+   - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91>`__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json <https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json>`__ file.
+2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186>`__ for the implementation.
+3. Next, vLLM `inspects <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189>`__ the ``model_type`` field in the config dictionary to `generate <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216>`__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48>`__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained <https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained>`__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that:
+   - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here <https://github.com/huggingface/transformers/tree/main/src/transformers/models>`__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek <https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json>`__ for an example.
+   - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled.
+4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244>`__ for the implementation.
+5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80>`__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364>`__. This class will initialize itself depending on various configs.
+Beyond that, there are two more things vLLM depends on HuggingFace for.
+1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained <https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained>`__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87>`__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24>`__.
+2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights.
+   - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation <https://huggingface.co/docs/safetensors/en/index>`__ for more information on the safetensors format. This part of the logic can be found `here <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385>`__.
+This completes the integration between vLLM and HuggingFace.
+In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. The config class comes from vLLM itself, from HuggingFace transformers, or from the model's repository.