Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
@@ -65,12 +65,7 @@ struct cutlass_sparse_3x_gemm {
      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
                                float>::type;

-  using EpilogueDescriptor =
-      cutlass::epilogue::collective::detail::EpilogueDescriptor<
-          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
-          ElementD, EpilogueSchedule>;
-
-  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;

  using ElementC = void;
  using LayoutC = cutlass::layout::RowMajor;

--- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -58,7 +58,9 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,

  // Guard against compilation issues for sm90 kernels
 #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
-  if (version_num >= 90) {
+  // We build for 9.0a which is not forward compatible, so restrict this to
+  // Hopper only
+  if (version_num == 90) {
    cutlass_scaled_sparse_mm_sm90(c, a, bt_nzs, bt_meta, a_scales, b_scales,
                                  bias);
    return;
@@ -82,7 +84,9 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a) {

  // Guard against compilation issues for sm90 kernels
 #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
-  if (version_num >= 90) {
+  // We build for 9.0a which is not forward compatible, so restrict this to
+  // Hopper only
+  if (version_num == 90) {
    std::vector<torch::Tensor> result_tensors;

    auto [a_meta, a_nzs] = cutlass_sparse_compress_sm90(a);

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -4,6 +4,7 @@
 #include "core/registration.h"

 #include <torch/library.h>
+#include <torch/version.h>

 // Note on op signatures:
 // The X_meta signatures are for the meta functions corresponding to op X.
@@ -17,6 +18,15 @@

 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
+  //
+
+  // The default behavior in PyTorch 2.6 is "requires_contiguous", so we need
+  // to override this for many GEMMs with the following tag. Otherwise,
+  // torch.compile will force all input tensors to be contiguous(), which
+  // will break many custom ops that require column-major weight matrices.
+  // TODO: remove this for PyTorch 2.8, when the default is planned to switch
+  // to match exact eager-mode strides.
+  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;

  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
@@ -353,25 +363,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def(
      "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
      "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
-      "-> Tensor");
+      "-> Tensor",
+      {stride_tag});
  ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);

  // Decompression method for AQLM.
  ops.def(
      "aqlm_dequant(Tensor codes, Tensor codebooks, "
-      "int[] codebook_partition_sizes) -> Tensor");
+      "int[] codebook_partition_sizes) -> Tensor",
+      {stride_tag});
  ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);

  // Quantized GEMM for AWQ.
  ops.def(
      "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
-      "Tensor _zeros, SymInt split_k_iters) -> Tensor");
+      "Tensor _zeros, SymInt split_k_iters) -> Tensor",
+      {stride_tag});
  ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);

  // Dequantization for AWQ.
  ops.def(
      "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
-      "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor");
+      "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor",
+      {stride_tag});
  ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);

  // Note about marlin kernel 'workspace' arguments:
@@ -392,7 +406,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def(
      "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
      "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> "
-      "Tensor");
+      "Tensor",
+      {stride_tag});
  // conditionally compiled so impl in source file

  // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
@@ -400,7 +415,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
      "Tensor b_scales, Tensor workspace, "
      "int b_q_type, "
-      "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor");
+      "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor",
+      {stride_tag});
  //  conditionally compiled so impl in source file

  // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
@@ -426,7 +442,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "   Tensor? channel_scales,"
      "   Tensor? token_scales,"
      "   str?    schedule"
-      ") -> Tensor");
+      ") -> Tensor",
+      {stride_tag});
  ops.def(
      "machete_prepack_B("
      "   Tensor B,"
@@ -445,7 +462,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
      "int b_q_type, "
      "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
-      "bool has_zp, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
+      "bool has_zp, bool use_atomic_add, bool use_fp32_reduce, "
+      "bool is_zp_float) -> Tensor",
+      {stride_tag});
  // conditionally compiled so impl registration is in source file

  // gptq_marlin repack from GPTQ.
@@ -476,12 +495,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor");
  ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);

+  // moe kernel for GGML.
+  ops.def(
+      "ggml_moe_a8(Tensor X, Tensor W, "
+      "Tensor sorted_token_ids, Tensor expert_ids, Tensor "
+      "num_tokens_post_padded, "
+      "int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor");
+  ops.impl("ggml_moe_a8", torch::kCUDA, &ggml_moe_a8);
+
+  ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size);
+
 #ifndef USE_ROCM
  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
  ops.def(
      "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
      "Tensor! workspace, int num_bits, SymInt size_m, SymInt size_n, "
-      "SymInt size_k) -> Tensor");
+      "SymInt size_k) -> Tensor",
+      {stride_tag});
  // conditionally compiled so impl registration is in source file

  // marlin_qqq_gemm for QQQ.
@@ -489,15 +519,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
      "Tensor s_tok, Tensor s_ch, Tensor s_group, "
      "Tensor! workspace, SymInt size_m, SymInt size_n, "
-      "SymInt size_k) -> Tensor");
+      "SymInt size_k) -> Tensor",
+      {stride_tag});
  // conditionally compiled so impl registration is in source file

+  // CUTLASS nvfp4 block scaled GEMM
+  ops.def(
+      "cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b,"
+      "                      Tensor block_scale_a, Tensor block_scale_b,"
+      "                      Tensor alpha) -> ()",
+      {stride_tag});
+  ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
+
  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
  // quantization, as well as bias
  ops.def(
      "cutlass_scaled_mm(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor? bias) -> ()");
+      "                  Tensor b_scales, Tensor? bias) -> ()",
+      {stride_tag});
  ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);

  // CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
@@ -506,7 +546,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor azp_adj,"
-      "                  Tensor? azp, Tensor? bias) -> ()");
+      "                  Tensor? azp, Tensor? bias) -> ()",
+      {stride_tag});
  ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp);

  // Check if cutlass scaled_mm is supported for CUDA devices of the given
@@ -519,7 +560,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
      "bool");
  ops.impl("cutlass_scaled_mm_supports_block_fp8",
-           &cutlass_scaled_mm_supports_fp8);
+           &cutlass_scaled_mm_supports_block_fp8);

  // Check if cutlass sparse scaled_mm is supported for CUDA devices of the
  // given capability
@@ -534,7 +575,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_scaled_sparse_mm(Tensor! out, Tensor a,"
      "                         Tensor bt_nzs,"
      "                         Tensor bt_meta, Tensor a_scales,"
-      "                         Tensor b_scales, Tensor? bias) -> ()");
+      "                         Tensor b_scales, Tensor? bias) -> ()",
+      {stride_tag});
  ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm);

  // CUTLASS sparse matrix compressor
@@ -582,15 +624,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                 Tensor! output_scale, Tensor input_scale) -> ()");
  ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);

+  // Check if cutlass_scaled_mm_fp4 is supported for CUDA devices
+  // of the given capability
+  ops.def("cutlass_scaled_mm_supports_fp4(int cuda_device_capability) -> bool");
+  ops.impl("cutlass_scaled_mm_supports_fp4", &cutlass_scaled_mm_supports_fp4);
 #endif

  // Quantized GEMM for GPTQ.
  // Note: even though the C++ inferred schema is correct for this op, it seems
  // to prevent the meta function registry.
+
 //   ops.def(
 //       "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
 //       "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
-//       "-> Tensor");
+//       "-> Tensor",
+//       {stride_tag});
 //   ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);

  // Post processing for GPTQ.
@@ -630,6 +678,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
           &dynamic_scaled_int8_quant);
+
+#ifndef USE_ROCM
+  // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
+  ops.def(
+      "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
+      "Tensor? b_zeros, "
+      "bool has_zp, Tensor! b_qweight_reorder, Tensor! b_scales_reorder, "
+      "Tensor!? b_zeros_reorder, "
+      "int K, int N, int N_32align) -> ()");
+  //  conditionally compiled so impl in source file
+
+  // AllSpark quantization ops
+  ops.def(
+      "allspark_w8a16_gemm(Tensor a, Tensor b_qweight, Tensor b_scales, "
+      "Tensor? b_qzeros, "
+      "SymInt n, SymInt group_size, SymInt sm_count, SymInt sm_version, SymInt "
+      "CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor");
+  //  conditionally compiled so impl in source file
+#endif
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@@ -699,6 +766,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
      "str kv_cache_dtype) -> ()");
  cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
+
+  // Gather cache blocks from src_cache to dst.
+  cache_ops.def(
+      "gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
+      "Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
+  cache_ops.impl("gather_cache", torch::kCUDA, &gather_cache);
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
@@ -715,7 +788,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
                  &get_max_shared_memory_per_block_device_attribute);
 }

-#ifndef USE_ROCM
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  // Custom all-reduce kernels
  custom_ar.def(
@@ -733,7 +805,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  custom_ar.def("register_buffer", &register_buffer);
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
  custom_ar.def("register_graph_buffers", &register_graph_buffers);
+  custom_ar.def("allocate_shared_buffer_and_handle",
+    &allocate_shared_buffer_and_handle);
+  custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle);
+  custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
+
+  custom_ar.def("free_shared_buffer", &free_shared_buffer);
 }
-#endif

 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/docs/README.md
+++ b/docs/README.md
@@ -4,7 +4,7 @@

 ```bash
 # Install dependencies.
-pip install -r requirements-docs.txt
+pip install -r ../requirements/docs.txt

 # Build the docs.
 make clean

--- a/docs/source/assets/contributing/dockerfile-stages-dependency.png
+++ b/docs/source/assets/contributing/dockerfile-stages-dependency.png
--- a/docs/source/assets/design/v1/metrics/intervals-1.png
+++ b/docs/source/assets/design/v1/metrics/intervals-1.png
--- a/docs/source/assets/design/v1/metrics/intervals-2.png
+++ b/docs/source/assets/design/v1/metrics/intervals-2.png
--- a/docs/source/assets/design/v1/metrics/intervals-3.png
+++ b/docs/source/assets/design/v1/metrics/intervals-3.png
--- a/docs/source/community/meetups.md
+++ b/docs/source/community/meetups.md
@@ -4,6 +4,8 @@

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
+- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
 - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
 - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
 - [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing)

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -17,7 +17,6 @@ import inspect
 import logging
 import os
 import sys
-from typing import List

 import requests
 from sphinx.ext import autodoc
@@ -58,7 +57,7 @@ templates_path = ['_templates']
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"]
+exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]

 # Exclude the prompt "$" when copying code
 copybutton_prompt_text = r"\$ "
@@ -86,6 +85,7 @@ html_static_path = ["_static"]
 html_js_files = ["custom.js"]
 html_css_files = ["custom.css"]

+myst_heading_anchors = 2
 myst_url_schemes = {
    'http': None,
    'https': None,

--- a/docs/source/contributing/model/basic.md
+++ b/docs/source/contributing/model/basic.md
@@ -74,8 +74,6 @@ def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
-    kv_caches: List[torch.Tensor],
-    attn_metadata: AttentionMetadata,
 ) -> torch.Tensor:
    ...
 ```

--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -16,8 +16,6 @@ Further update the model as follows:
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
  +     pixel_values: torch.Tensor,
    ) -> SamplerOutput:
  ```
@@ -36,7 +34,8 @@ Further update the model as follows:
            image_features = self.vision_encoder(image_input)
            return self.multi_modal_projector(image_features)

-        def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
+        def get_multimodal_embeddings(
+                self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

            # Validate the multimodal input keyword arguments
            image_input = self._parse_and_validate_image_input(**kwargs)
@@ -63,7 +62,7 @@ Further update the model as follows:
        def get_input_embeddings(
            self,
            input_ids: torch.Tensor,
-            multimodal_embeddings: Optional[NestedTensors] = None,
+            multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
        ) -> torch.Tensor:

            # `get_input_embeddings` should already be implemented for the language 
@@ -722,13 +721,13 @@ def _get_mm_fields_config(

 :::::

-### Prompt replacements
+### Prompt updates

-Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to
-return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances.
+Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to
+return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances.

-Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace
-operation performed by the HF processor.
+Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation
+(e.g.: insertion, replacement) performed by the HF processor.

 ::::{tab-set}
 :::{tab-item} Basic example: LLaVA
@@ -745,15 +744,15 @@ for sample in text:
 ```

 It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
-Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` as follows:
+Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows:

 ```python
-def _get_prompt_replacements(
+def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
-) -> list[PromptReplacement]:
+) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    image_token_id = hf_config.image_token_index

@@ -861,7 +860,7 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
 )
 ```

-To accommodate this, instead of a string you can return an instance of `PromptReplacementDetails`
+To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
 with different `full` and `feature` attributes:

 ```python
@@ -880,7 +879,7 @@ def get_replacement_fuyu(item_idx: int):
    image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                    [_NEWLINE_TOKEN_ID]) * nrows

-    return PromptReplacementDetails(
+    return PromptUpdateDetails(
        full=image_tokens + [bos_token_id],
        features=image_tokens,
    )
@@ -890,12 +889,12 @@ Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the
 we can search for it to conduct the replacement at the start of the string:

 ```python
-def _get_prompt_replacements(
+def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
-) -> list[PromptReplacement]:
+) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    bos_token_id = hf_config.bos_token_id
    assert isinstance(bos_token_id, int)
@@ -915,7 +914,7 @@ def _get_prompt_replacements(
        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                        [_NEWLINE_TOKEN_ID]) * nrows

-        return PromptReplacementDetails(
+        return PromptUpdateDetails(
            full=image_tokens + [bos_token_id],
            features=image_tokens,
        )
@@ -950,3 +949,35 @@ to register them to the multi-modal registry:
 +                                         dummy_inputs=YourDummyInputsBuilder)
  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
 ```
+
+## Notes
+
+### Inserting feature tokens without replacement
+
+Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use {class}`~vllm.multimodal.processing.PromptInsertion` instead of {class}`~vllm.multimodal.processing.PromptReplacement` inside {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
+
+Examples:
+
+- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
+- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
+- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
+
+### Handling prompt updates unrelated to multi-modal data
+
+{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only` so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](#mm-processing).
+
+Examples:
+
+- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
+- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
+- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
+
+### Custom HF processor
+
+Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor`.
+
+Examples:
+
+- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
+- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
+- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -23,7 +23,7 @@ Check out the [building from source](#build-from-source) documentation for detai
 ## Testing

 ```bash
-pip install -r requirements-dev.txt
+pip install -r requirements/dev.txt

 # Linting, formatting and static type checking
 pre-commit install --hook-type pre-commit --hook-type commit-msg
@@ -145,6 +145,9 @@ review process:
 - Please respond to all comments within a reasonable time frame. If a comment
  isn't clear or you disagree with a suggestion, feel free to ask for
  clarification or discuss the suggestion.
+- Note that not all CI checks will be executed due to limited computational
+  resources. The reviewer will add `ready` label to the PR when the PR is
+  ready to merge or a full CI run is needed.

 ## Thank You


--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -4,6 +4,8 @@
 Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
 :::

+## Profile with PyTorch Profiler
+
 We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`

 The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
@@ -22,13 +24,13 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 `export VLLM_RPC_TIMEOUT=1800000`
 :::

-## Example commands and usage
+### Example commands and usage

-### Offline Inference
+#### Offline Inference

 Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.

-### OpenAI Server
+#### OpenAI Server

 ```bash
 VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
@@ -39,3 +41,135 @@ benchmark_serving.py:
 ```bash
 python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
 ```
+
+## Profile with NVIDIA Nsight Systems
+
+Nsight systems is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events.
+
+[Install nsight-systems](https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html) using your package manager.
+The following block is an example for Ubuntu.
+
+```bash
+apt update
+apt install -y --no-install-recommends gnupg
+echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
+apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
+apt update
+apt install nsight-systems-cli
+```
+
+### Example commands and usage
+
+#### Offline Inference
+
+For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
+
+The following is an example using the `benchmarks/benchmark_latency.py` script:
+
+```bash
+nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node python benchmarks/benchmark_latency.py --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1 --batch-size 16 --input-len 512 --output-len 8
+```
+
+#### OpenAI Server
+
+To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, however you must specify `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration time has been used up, the server will be killed.
+
+```bash
+# server
+nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 vllm serve meta-llama/Llama-3.1-8B-Instruct
+
+# client
+python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 1 --dataset-name random --random-input 1024 --random-output 512
+```
+
+In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
+
+```
+nsys sessions list
+```
+
+to get the session id in the form of `profile-XXXXX`, then run:
+
+```
+nsys stop --session=profile-XXXXX
+```
+
+to manually kill the profiler and generate your `nsys-rep` report.
+
+#### Analysis
+
+You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).
+
+CLI example:
+
+```bash
+nsys stats report1.nsys-rep
+...
+ ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
+
+ Time (%)  Total Time (ns)  Instances   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)                                                  Name                                                
+ --------  ---------------  ---------  -----------  -----------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
+     46.3   10,327,352,338     17,505    589,965.9    144,383.0    27,040  3,126,460    944,263.8  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
+     14.8    3,305,114,764      5,152    641,520.7    293,408.0   287,296  2,822,716    867,124.9  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
+     12.1    2,692,284,876     14,280    188,535.4     83,904.0    19,328  2,862,237    497,999.9  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
+      9.5    2,116,600,578     33,920     62,399.8     21,504.0    15,326  2,532,285    290,954.1  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
+      5.0    1,119,749,165     18,912     59,208.4      9,056.0     6,784  2,578,366    271,581.7  void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
+      4.1      916,662,515     21,312     43,011.6     19,776.0     8,928  2,586,205    199,790.1  void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
+      2.6      587,283,113     37,824     15,526.7      3,008.0     2,719  2,517,756    139,091.1  std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
+      1.9      418,362,605     18,912     22,121.5      3,871.0     3,328  2,523,870    175,248.2  void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
+      0.7      167,083,069     18,880      8,849.7      2,240.0     1,471  2,499,996    101,436.1  void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
+... 
+```
+
+GUI example:
+
+<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
+
+## Profiling vLLM Python Code
+
+The Python standard library includes
+[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
+code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
+Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
+used to profile a section of code.
+
+### Example usage - decorator
+
+The first helper is a Python decorator that can be used to profile a function.
+If a filename is specified, the profile will be saved to that file. If no filename is
+specified, profile data will be printed to stdout.
+
+```python
+import vllm.utils
+
+@vllm.utils.cprofile("expensive_function.prof")
+def expensive_function():
+    # some expensive code
+    pass
+```
+
+### Example Usage - context manager
+
+The second helper is a context manager that can be used to profile a block of
+code. Similar to the decorator, the filename is optional.
+
+```python
+import vllm.utils
+
+def another_function():
+    # more expensive code
+    pass
+
+with vllm.utils.cprofile_context("another_function.prof"):
+    another_function()
+```
+
+### Analyzing Profile Results
+
+There are multiple tools available that can help analyze the profile results.
+One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
+
+```bash
+pip install snakeviz
+snakeviz expensive_function.prof
+```
--- a/docs/source/contributing/vulnerability_management.md
+++ b/docs/source/contributing/vulnerability_management.md
@@ -37,7 +37,7 @@ you may contact the following individuals:

 ## Slack Discussion

-You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
+You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
 to discuss security-related topics. However, please do not disclose any
 vulnerabilities in this channel. If you need to report a vulnerability, please
 use the GitHub security advisory system or contact a VMT member privately.

--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -27,6 +27,36 @@ container to access the host's shared memory. vLLM uses PyTorch, which uses shar
 memory to share data between processes under the hood, particularly for tensor parallel inference.
 :::

+:::{note}
+Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
+
+If you need to use those dependencies (having accepted the license terms),
+create a custom Dockerfile on top of the base image with an extra layer that installs them:
+
+```Dockerfile
+FROM vllm/vllm-openai:v0.8.0
+
+# e.g. install the `audio` and `video` optional dependencies
+# NOTE: Make sure the version of vLLM matches the base image!
+RUN uv pip install vllm[audio,video]==0.8.0
+```
+
+:::
+
+:::{tip}
+Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
+
+To use the development version of `transformers`, create a custom Dockerfile on top of the base image
+with an extra layer that installs their code from source:
+
+```Dockerfile
+FROM vllm/vllm-openai:latest
+
+RUN uv pip install git+https://github.com/huggingface/transformers.git
+```
+
+:::
+
 (deployment-docker-build-image-from-source)=

 ## Building vLLM's Docker Image from Source

--- a/docs/source/deployment/frameworks/helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@@ -4,9 +4,9 @@

 A Helm chart to deploy vLLM for Kubernetes

-Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
+Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.

-This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
+This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.

 ## Prerequisites


--- a/docs/source/deployment/frameworks/lws.md
+++ b/docs/source/deployment/frameworks/lws.md
@@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference.

 vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.

-Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
-deploying vLLM on Kubernetes using LWS.
+## Prerequisites
+
+* At least two Kubernetes nodes, each with 8 GPUs, are required.
+* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
+
+## Deploy and Serve
+
+Deploy the following yaml file `lws.yaml`
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: vllm
+spec:
+  replicas: 2
+  leaderWorkerTemplate:
+    size: 2
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+          - name: vllm-leader
+            image: docker.io/vllm/vllm-openai:latest
+            env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
+            command:
+              - sh
+              - -c
+              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
+                 python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
+              requests:
+                ephemeral-storage: 800Gi
+                cpu: 125
+            ports:
+              - containerPort: 8080
+            readinessProbe:
+              tcpSocket:
+                port: 8080
+              initialDelaySeconds: 15
+              periodSeconds: 10
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+    workerTemplate:
+      spec:
+        containers:
+          - name: vllm-worker
+            image: docker.io/vllm/vllm-openai:latest
+            command:
+              - sh
+              - -c
+              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
+              requests:
+                ephemeral-storage: 800Gi
+                cpu: 125
+            env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm   
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-leader
+spec:
+  ports:
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    leaderworkerset.sigs.k8s.io/name: vllm
+    role: leader
+  type: ClusterIP
+```
+
+```bash
+kubectl apply -f lws.yaml
+```
+
+Verify the status of the pods:
+
+```bash
+kubectl get pods
+```
+
+Should get an output similar to this:
+
+```bash
+NAME       READY   STATUS    RESTARTS   AGE
+vllm-0     1/1     Running   0          2s
+vllm-0-1   1/1     Running   0          2s
+vllm-1     1/1     Running   0          2s
+vllm-1-1   1/1     Running   0          2s
+```
+
+Verify that the distributed tensor-parallel inference works:
+
+```bash
+kubectl logs vllm-0 |grep -i "Loading model weights took" 
+```
+
+Should get something similar to this:
+
+```text
+INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
+(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
+```
+
+## Access ClusterIP service
+
+```bash
+# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
+kubectl port-forward svc/vllm-leader 8080:8080
+```
+
+The output should be similar to the following:
+
+```text
+Forwarding from 127.0.0.1:8080 -> 8080
+Forwarding from [::1]:8080 -> 8080
+```
+
+## Serve the model
+
+Open another terminal and send a request
+
+```text
+curl http://localhost:8080/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+    "prompt": "San Francisco is a",
+    "max_tokens": 7,
+    "temperature": 0
+}'
+```
+
+The output should be similar to the following
+
+```text
+{
+  "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
+  "object": "text_completion",
+  "created": 1715138766,
+  "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+  "choices": [
+    {
+      "index": 0,
+      "text": " top destination for foodies, with",
+      "logprobs": null,
+      "finish_reason": "length",
+      "stop_reason": null
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 5,
+    "total_tokens": 12,
+    "completion_tokens": 7
+  }
+}
+```
--- a/docs/source/deployment/integrations/index.md
+++ b/docs/source/deployment/integrations/index.md
@@ -6,4 +6,6 @@
 kserve
 kubeai
 llamastack
+llmaz
+production-stack
 :::
--- a/docs/source/deployment/integrations/llmaz.md
+++ b/docs/source/deployment/integrations/llmaz.md
+(deployment-llmaz)=
+
+# llmaz
+
+[llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend.
+
+Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details.