sync v0.18.0

eefa41c1 · zhuwenwen · 82155c76 · eefa41c1 · eefa41c1 · eefa41c1
Commit eefa41c1 authored Mar 24, 2026 by zhuwenwen
20 changed files
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -10,8 +10,6 @@ from transformers import AutoConfig
 from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
-    _moe_permute,
-    _moe_unpermute_and_reduce,
    moe_permute,
    moe_unpermute,
 )
@@ -41,7 +39,6 @@ def benchmark_permute(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    use_customized_permute: bool = False,
 ) -> float:
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
@@ -113,11 +110,9 @@ def benchmark_unpermute(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    use_customized_permute: bool = False,
 ) -> float:
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    output_hidden_states = torch.empty_like(hidden_states)
    if use_fp8_w8a8:
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
    else:
@@ -151,36 +146,15 @@ def benchmark_unpermute(
        )
    def run(input: tuple):
-        if use_customized_permute:
+        (permuted_hidden_states, first_token_off, inv_perm_idx) = input
-            (
+        output = torch.empty_like(hidden_states)
-                permuted_hidden_states,
+        moe_unpermute(
-                first_token_off,
+            output,
-                inv_perm_idx,
+            permuted_hidden_states,
-                m_indices,
+            topk_weights,
-            ) = input
+            inv_perm_idx,
-            output = torch.empty_like(hidden_states)
+            first_token_off,
-            moe_unpermute(
+        )
-                output,
-                permuted_hidden_states,
-                topk_weights,
-                inv_perm_idx,
-                first_token_off,
-            )
-        else:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            ) = input
-            _moe_unpermute_and_reduce(
-                output_hidden_states,
-                permuted_hidden_states,
-                inv_perm,
-                topk_weights,
-                True,
-            )
    # JIT compilation & warmup
    input = prepare()
@@ -235,8 +209,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
-        use_customized_permute: bool = False,
+    ) -> tuple[float, float]:
-    ) -> tuple[dict[str, int], float]:
        set_random_seed(self.seed)
        permute_time = benchmark_permute(
@@ -248,7 +221,6 @@ class BenchmarkWorker:
            use_fp8_w8a8,
            use_int8_w8a16,
            num_iters=100,
-            use_customized_permute=use_customized_permute,
        )
        unpermute_time = benchmark_unpermute(
            num_tokens,
@@ -259,7 +231,6 @@ class BenchmarkWorker:
            use_fp8_w8a8,
            use_int8_w8a16,
            num_iters=100,
-            use_customized_permute=use_customized_permute,
        )
        return permute_time, unpermute_time
@@ -306,7 +277,6 @@ def main(args: argparse.Namespace):
    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
-    use_customized_permute = args.use_customized_permute
    if args.batch_size is None:
        batch_sizes = [
@@ -358,7 +328,6 @@ def main(args: argparse.Namespace):
                dtype,
                use_fp8_w8a8,
                use_int8_w8a16,
-                use_customized_permute,
            )
            for batch_size in batch_sizes
        ],
@@ -378,7 +347,6 @@ if __name__ == "__main__":
    parser.add_argument(
        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
    )
-    parser.add_argument("--use-customized-permute", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--trust-remote-code", action="store_true")

--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -26,6 +26,12 @@
 typedef __hip_bfloat16 __nv_bfloat16;
 #endif
+#if defined(__gfx942__)
+constexpr float kFp8ScaleDivisor = 224.f;
+#else
+constexpr float kFp8ScaleDivisor = 448.f;
+#endif
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 int64_t block_size_in_bytes,
                 const torch::Tensor& block_mapping) {
@@ -403,8 +409,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
  }
  // Compute the scale for the tile
-  float tile_scale = max_abs / 448.f;
+  float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);
-  tile_scale = fmaxf(tile_scale, FLT_MIN);
  // The first lane of each half-warp writes the scale to kv_cache
  if ((lane_idx == 0) || (lane_idx == 16)) {
@@ -473,11 +478,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
 #endif
  }
-#if defined(__gfx942__)
+float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;
-  float scale = fmaxf(amax, 1e-4) / 224.0f;
-#else
-  float scale = fmaxf(amax, 1e-4) / 448.0f;
-#endif
  if (use_ue8m0) {
    scale = exp2f(ceilf(log2f(scale)));

--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -358,13 +358,14 @@ void onednn_scaled_mm(
    const std::optional<torch::Tensor>& azp,      // [M] or [1]
    const std::optional<torch::Tensor>& azp_adj,  // [M] or [1]
    const std::optional<torch::Tensor>& bias,     // [N]
-    int64_t handler) {
+    const torch::Tensor& handler_tensor) {
  CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.is_contiguous());
  TORCH_CHECK(c.is_contiguous());
  W8A8MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
+          handler_tensor.item<int64_t>());
  const int32_t* azp_ptr = nullptr;
  if (azp.has_value()) {
    azp_ptr = azp->data_ptr<int32_t>();
@@ -517,13 +518,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
 void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
               const torch::Tensor& a,  // [M, IC], row-major
-               const std::optional<torch::Tensor>& bias, int64_t handler) {
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor) {
  CPU_KERNEL_GUARD_IN(onednn_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.stride(-1) == 1);
  TORCH_CHECK(c.stride(-1) == 1);
  MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());
 // ACL matmuls expect contiguous source tensors
 #ifdef VLLM_USE_ACL
@@ -565,4 +567,4 @@ void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
    ptr->execute(exec_args);
  });
 }
\ No newline at end of file
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -23,13 +23,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                      const std::optional<torch::Tensor>& azp,
                      const std::optional<torch::Tensor>& azp_adj,
                      const std::optional<torch::Tensor>& bias,
-                      int64_t handler);
+                      const torch::Tensor& handler_tensor);
 int64_t create_onednn_mm_handler(const torch::Tensor& b,
                                 int64_t primitive_cache_size);
 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
-               const std::optional<torch::Tensor>& bias, int64_t handler);
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor);
 bool is_onednn_acl_supported();
@@ -200,7 +201,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // oneDNN GEMM
  ops.def(
      "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
-      "int handler) -> ()");
+      "Tensor handler_tensor) -> ()");
  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
  // Check if oneDNN was built with ACL backend
@@ -216,7 +217,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
  ops.def(
      "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
-      "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
+      "Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
  ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
  // Compute int8 quantized tensor for given scaling factor.
@@ -335,4 +336,4 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
\ No newline at end of file
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -58,6 +58,11 @@ You can tune the performance by adjusting `max_num_batched_tokens`:
 - For optimal throughput, we recommend setting `max_num_batched_tokens > 8192` especially for smaller models on large GPUs.
 - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes).
+!!! warning
+    When chunked prefill is disabled, `max_num_batched_tokens` must be greater than `max_model_len`.  
+    In that case, if `max_num_batched_tokens < max_model_len`, vLLM may crash at server start‑up.
 ```python
 from vllm import LLM

--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -71,7 +71,7 @@ class MyModel(nn.Module):
 ```python
 def forward(
    self,
-    input_ids: torch.Tensor,
+    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,

--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@@ -43,28 +43,73 @@ Further update the model as follows:
                )
    ```
- Implement [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
+- Remove the embedding part from the [forward][torch.nn.Module.forward] method:
+    - Move the multi-modal embedding to [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal].
+    - The text embedding and embedding merge are handled automatically by a default implementation of [embed_input_ids][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids]. It does not need to be overridden in most cases.
+    ```diff
+      def forward(
+          self,
+          input_ids: torch.Tensor | None,
+    -     pixel_values: torch.Tensor,
+          positions: torch.Tensor,
+          intermediate_tensors: IntermediateTensors | None = None,
+          inputs_embeds: torch.Tensor | None = None,
+      ) -> torch.Tensor:
+    -     if inputs_embeds is None:
+    -         inputs_embeds = self.get_input_embeddings()(input_ids)
+    -
+    -     if pixel_values is not None:
+    -         image_features = self.get_image_features(
+    -             pixel_values=pixel_values,
+    -         )
+    -         special_image_mask = self.get_placeholder_mask(
+    -             input_ids,
+    -             inputs_embeds=inputs_embeds,
+    -             image_features=image_features,
+    -         )
+    -         inputs_embeds = inputs_embeds.masked_scatter(
+    -             special_image_mask,
+    -             image_features,
+    -         )
+           hidden_states = self.language_model(
+               input_ids,
+               positions,
+               intermediate_tensors,
+               inputs_embeds=inputs_embeds,
+           )
+         ...
+    +  def embed_multimodal(
+    +      self,
+    +      pixel_values: torch.Tensor,
+    +  ) -> MultiModalEmbeddings | None:
+    +      return self.get_image_features(
+    +          pixel_values=pixel_values,
+    +      )
+    ```
-    ??? code
+    Below we provide a boilerplate of a typical implementation pattern of [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal], but feel free to adjust it to your own needs.
-        ```python
+    ```python
-            def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
+    def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
-                image_features = self.vision_encoder(image_input)
+        image_features = self.vision_encoder(image_input)
-                return self.multi_modal_projector(image_features)
+        return self.multi_modal_projector(image_features)
-            def embed_multimodal(
+    def embed_multimodal(
-                self,
+        self,
-                **kwargs: object,
+        **kwargs: object,
-            ) -> MultiModalEmbeddings | None:
+    ) -> MultiModalEmbeddings | None:
-                # Validate the multimodal input keyword arguments
+        # Validate the multimodal input keyword arguments
-                image_input = self._parse_and_validate_image_input(**kwargs)
+        image_input = self._parse_and_validate_image_input(**kwargs)
-                if image_input is None:
+        if image_input is None:
-                    return None
+            return None
-                # Run multimodal inputs through encoder and projector
+        # Run multimodal inputs through encoder and projector
-                vision_embeddings = self._process_image_input(image_input)
+        vision_embeddings = self._process_image_input(image_input)
-                return vision_embeddings
+        return vision_embeddings
-        ```
+    ```
 !!! important
    The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.

--- a/docs/design/lora_resolver_plugins.md
+++ b/docs/design/lora_resolver_plugins.md
@@ -10,7 +10,7 @@ receives a request for a LoRA adapter that hasn't been loaded yet, the resolver
 to locate and load the adapter from their configured storage locations. This enables:
 - **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source.
+- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, while the built-in `hf_hub_resolver` will pull LoRA adapters from Huggingface Hub and proceed in an identical manner. In general, custom resolvers can be implemented to fetch from any source.
 - **Automatic Discovery**: Seamless integration with existing LoRA workflows
 - **Scalable Deployment**: Centralized adapter management across multiple vLLM instances

--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -160,10 +160,12 @@ Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adap
 You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.
-You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory.](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers)
+You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory, as well as a resolver plugin to load LoRA adapters from repositories on Hugging Face Hub](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers)
-To enable this resolver, set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True, set `VLLM_PLUGINS` to include `lora_filesystem_resolver`, and then set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
+To enable either of these resolvers, you must `set VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True.
-it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and
-that adapter will then be available for normal use on the server.
+- To leverage a local directory, set `VLLM_PLUGINS` to include `lora_filesystem_resolver` and set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
+it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and that adapter will then be available for normal use on the server.
+- To leverage repositories on Hugging Face Hub, set `VLLM_PLUGINS` to include `lora_hf_hub_resolver` and set `VLLM_LORA_RESOLVER_HF_REPO_LIST` to a comma separated list of repository IDs on Hugging Face Hub. When vLLM receives a request for the LoRA adapter `my/repo/subpath`, it will download the adapter at the `subpath` of `my/repo` if it exists and contains an `adapter_config.json`, then build a request to the cached dir for the adapter, similar to the `lora_filesystem_resolver`. Please note that enabling remote downloads is insecure and not intended for use in production environments.
 Alternatively, follow these example steps to implement your own plugin:

--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -213,10 +213,19 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
 --kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
 ```
+### Cross layers blocks
+By default, this feature is disabled. On attention backends that support this feature, each logical block is contiguous in physical memory. This reduces the number of buffers that need to be transferred.
+To enable this feature:
+```bash
+--kv-transfer-config '{..., "kv_connector_extra_config": {"enable_cross_layers_blocks": "True"}}'
+```
 ## Example Scripts/Code
 Refer to these example scripts in the vLLM repository:
 - [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
 - [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
 - [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
\ No newline at end of file
--- a/examples/offline_inference/qwen3_omni/only_thinker.py
+++ b/examples/offline_inference/qwen3_omni/only_thinker.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference
-with the correct prompt format on Qwen2.5-Omni (thinker only).
+with the correct prompt format on Qwen3-Omni (thinker only).
 """
 from typing import NamedTuple
@@ -112,23 +112,51 @@ def get_multi_audios_query() -> QueryResult:
    )
+def get_multi_images_query() -> QueryResult:
+    question = "What are the differences between these two images?"
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+        "<|vision_start|><|image_pad|><|vision_end|>"
+        f"{question}<|im_end|>\n"
+        f"<|im_start|>assistant\n"
+    )
+    return QueryResult(
+        inputs={
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": [
+                    convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB"),
+                    convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB"),
+                ],
+            },
+        },
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+    )
 query_map = {
    "mixed_modalities": get_mixed_modalities_query,
    "use_audio_in_video": get_use_audio_in_video_query,
    "multi_audios": get_multi_audios_query,
+    "multi_images": get_multi_images_query,
 }
 def main(args):
-    model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+    model_name = args.model
    query_result = query_map[args.query_type]()
    llm = LLM(
        model=model_name,
-        max_model_len=12800,
+        max_model_len=args.max_model_len,
        max_num_seqs=5,
        limit_mm_per_prompt=query_result.limit_mm_per_prompt,
        seed=args.seed,
+        tensor_parallel_size=args.tensor_parallel_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
    )
    # We set temperature to 0.2 so that outputs can be different
@@ -161,6 +189,31 @@ def parse_args():
        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="Qwen/Qwen3-Omni-30B-A3B-Instruct",
+        help="Model name or path.",
+    )
+    parser.add_argument(
+        "--tensor-parallel-size",
+        "-tp",
+        type=int,
+        default=1,
+        help="Tensor parallel size for distributed inference.",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.9,
+        help="GPU memory utilization (0.0 to 1.0).",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=12800,
+        help="Maximum model context length.",
+    )
    return parser.parse_args()

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -609,6 +609,42 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    )
+# GLM-OCR
+def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "zai-org/GLM-OCR"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        mm_processor_kwargs={
+            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+        enforce_eager=True,
+    )
+    if modality == "image":
+        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    elif modality == "video":
+        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+    prompts = [
+        (
+            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
+            f"{placeholder}"
+            f"{question}<|assistant|>assistant\n"
+        )
+        for question in questions
+    ]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 # H2OVL-Mississippi
 def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -2022,6 +2058,32 @@ def run_step3(questions: list[str], modality: str) -> ModelRequestData:
    )
+# StepVL10B
+def run_step_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "stepfun-ai/Step3-VL-10B"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+        trust_remote_code=True,
+        limit_mm_per_prompt={modality: 1},
+        reasoning_parser="deepseek_r1",
+    )
+    prompts = [
+        "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
+        f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
+        for question in questions
+    ]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 # omni-research/Tarsier-7b
 def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -2096,6 +2158,7 @@ model_example_map = {
    "glm4_1v": run_glm4_1v,
    "glm4_5v": run_glm4_5v,
    "glm4_5v_fp8": run_glm4_5v_fp8,
+    "glm_ocr": run_glm_ocr,
    "h2ovl_chat": run_h2ovl,
    "hunyuan_vl": run_hunyuan_vl,
    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
@@ -2143,6 +2206,7 @@ model_example_map = {
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
    "step3": run_step3,
+    "stepvl": run_step_vl,
    "tarsier": run_tarsier,
    "tarsier2": run_tarsier2,
 }
@@ -2150,6 +2214,7 @@ model_example_map = {
 MODELS_NEED_VIDEO_METADATA = [
    "glm4_1v",
+    "glm_ocr",
    "glm4_5v",
    "glm4_5v_fp8",
    "molmo2",
@@ -2481,4 +2546,4 @@ def main(args):
 if __name__ == "__main__":
    args = parse_args()
    main(args)
\ No newline at end of file
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1208,6 +1208,32 @@ def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    )
+def load_step_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "stepfun-ai/Step3-VL-10B"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        hf_overrides={"vision_config": {"enable_patch": False}},
+        trust_remote_code=True,
+        reasoning_parser="deepseek_r1",
+    )
+    prompt = (
+        "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
+        f"{'<im_patch>' * len(image_urls)}{question}<|EOT|><|BOT|>"
+        "assistant\n<think>\n"
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
 def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "omni-research/Tarsier-7b"
@@ -1437,6 +1463,7 @@ model_example_map = {
    "rvl": load_r_vl,
    "smolvlm": load_smolvlm,
    "step3": load_step3,
+    "stepvl": load_step_vl,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_1v": load_glm4_1v,
@@ -1606,4 +1633,4 @@ def main(args: Namespace):
 if __name__ == "__main__":
    args = parse_args()
    main(args)
\ No newline at end of file
--- a/examples/others/logging_configuration.md
+++ b/examples/others/logging_configuration.md
@@ -157,6 +157,37 @@ VLLM_CONFIGURE_LOGGING=0 \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```
+### Example 4: Disable access logs for health check endpoints
+In production environments, health check endpoints like `/health`, `/metrics`,
+and `/ping` are frequently called by load balancers and monitoring systems,
+generating a large volume of repetitive access logs. To reduce log noise while
+keeping logs for other endpoints, use the `--disable-access-log-for-endpoints`
+option.
+**Disable access logs for health and metrics endpoints:**
+```bash
+vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 \
+    --disable-access-log-for-endpoints /health,/metrics,/ping
+```
+**Common endpoints to consider filtering:**
+| Endpoint   | Description            | Typical Caller                                       |
+| ---------- | ---------------------- | ---------------------------------------------------- |
+| `/health`  | Health check           | Kubernetes liveness/readiness probes, load balancers |
+| `/metrics` | Prometheus metrics     | Prometheus scraper (every 15-60s)                    |
+| `/ping`    | SageMaker health check | SageMaker infrastructure                             |
+| `/load`    | Server load metrics    | Custom monitoring                                    |
+**Notes:**
+- This option only affects uvicorn access logs, not vLLM application logs
+- Specify multiple endpoints by separating them with commas (no spaces)
+- The filter uses exact path matching, query parameters are ignored (e.g., `/health?verbose=true` matches `/health`)
+- If you need to completely disable all access logs, use `--disable-uvicorn-access-log` instead
 ## Additional resources
 - [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
\ No newline at end of file
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,7 @@ vllm = "vllm.entrypoints.cli.main:main"
 [project.entry-points."vllm.general_plugins"]
 lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
+lora_hf_hub_resolver = "vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"
 [tool.setuptools_scm]
 # no extra settings needed, presence enables setuptools-scm

--- a/tests/entrypoints/openai/chat_completion/test_serving_chat.py
+++ b/tests/entrypoints/openai/chat_completion/test_serving_chat.py
@@ -524,6 +524,7 @@ class MockModelConfig:
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
+    hf_text_config = MockHFConfig()
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
@@ -1928,4 +1929,4 @@ class TestCreateRemainingArgsDelta:
        assert tc.id == "call_nofunc"
        assert tc.type == "function"
        assert tc.function.name is None
        assert tc.function.arguments == '{"data": "value"}'
\ No newline at end of file
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -43,7 +43,6 @@ from .mk_objects import (
    TestMoEQuantConfig,
    expert_info,
    make_fused_experts,
-    make_prepare_finalize,
    prepare_finalize_info,
 )
 from .parallel_utils import ProcessGroupInfo
@@ -586,10 +585,12 @@ def make_modular_kernel(
        routing_method=RoutingMethodType.DeepSeekV3,
    )
-    # make modular kernel
+    prepare_finalize = maybe_make_prepare_finalize(
-    prepare_finalize = make_prepare_finalize(
+        moe=moe,
-        config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
+        quant_config=quant_config,
+        allow_new_interface=True,
    )
+    assert prepare_finalize is not None
    fused_experts = make_fused_experts(
        config.fused_experts_type,
@@ -673,4 +674,4 @@ def run_modular_kernel(
    ):
        out = mk.apply(**mk_kwargs)
    return out
\ No newline at end of file
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -7,9 +7,6 @@ import torch
 # Fused experts and PrepareFinalize imports
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe import TritonExperts
-from vllm.model_executor.layers.fused_moe.all2all_utils import (
-    maybe_make_prepare_finalize,
-)
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts,
 )
@@ -435,23 +432,6 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
    ]
-def make_prepare_finalize(
-    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
-    backend: str | None,
-    moe: FusedMoEConfig,
-    quant_config: FusedMoEQuantConfig,
-) -> mk.FusedMoEPrepareAndFinalize:
-    if backend != "naive" and backend is not None:
-        prepare_finalize = maybe_make_prepare_finalize(moe, quant_config)
-        assert prepare_finalize is not None
-        return prepare_finalize
-    elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
-        return create_flashinfer_prepare_finalize(
-            use_dp=moe.moe_parallel_config.dp_size > 1
-        )
-    else:
-        return MoEPrepareAndFinalizeNoEP()
 def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
    s = rank * num_local_experts
    e = s + num_local_experts
@@ -500,4 +480,4 @@ def make_fused_experts(
    torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80)
    return experts
\ No newline at end of file
--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import numpy as np
 import pytest
 import torch
 from transformers import AutoModelForTokenClassification
@@ -40,7 +42,6 @@ def test_bert_like_models(
    model: str,
    dtype: str,
 ) -> None:
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)
@@ -73,6 +74,7 @@ def test_bert_like_models(
 @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
 @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.flaky(reruns=3)
 @torch.inference_mode
 def test_modernbert_models(
    hf_runner,
@@ -81,6 +83,14 @@ def test_modernbert_models(
    model: str,
    dtype: str,
 ) -> None:
+    # NOTE: https://github.com/vllm-project/vllm/pull/32403
+    # `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
+    # model, which can cause numerical precision variance and edge cases.
+    # We use @flaky(reruns=3) to mitigate intermittent failures.
+    print(
+        f"\n[NOTE] Testing {model} (randomly initialized weights) - "
+        "flaky tolerance enabled due to numerical precision variance."
+    )
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)
@@ -140,4 +150,4 @@ def test_auto_conversion(
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_output = hf_output.detach().clone().cpu().float()
        vllm_output = vllm_output.detach().clone().cpu().float()
        assert torch.allclose(hf_output, vllm_output, atol=1e-2)
\ No newline at end of file
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -91,6 +91,19 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "use_processor": True,
        "question": "What is the content of each image?",
    },
+    "glm_ocr": {
+        "model_name": "zai-org/GLM-OCR",
+        "interface": "llm_generate",
+        "max_model_len": 131072,
+        "max_num_seqs": 2,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 256,
+            "stop_token_ids": None,
+        },
+        "use_processor": True,
+        "question": "Text Recognition:",
+    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",
@@ -427,4 +440,4 @@ def test_vit_backend_functionality(
    elif config["interface"] == "llm_generate":
        run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
    else:
        raise ValueError(f"Unknown interface: {config['interface']}")
\ No newline at end of file