Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -100,8 +100,8 @@ bench_sweep_plot_pareto = auto_mock(
    "vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs"
 )
 bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
-bench_sweep_serve_sla = auto_mock(
+bench_sweep_serve_workload = auto_mock(
-    "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
+    "vllm.benchmarks.sweep.serve_workload", "SweepServeWorkloadArgs"
 )
 bench_throughput = auto_mock("vllm.benchmarks", "throughput")
 AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
@@ -229,7 +229,9 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
        "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
        "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
        "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
-        "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
+        "bench_sweep_serve_workload": create_parser(
+            bench_sweep_serve_workload.add_cli_args
+        ),
        "bench_throughput": create_parser(bench_throughput.add_cli_args),
    }

--- a/docs/mkdocs/hooks/generate_metrics.py
+++ b/docs/mkdocs/hooks/generate_metrics.py
@@ -22,6 +22,7 @@ METRIC_SOURCE_FILES = [
        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
        "output": "nixl_connector.inc.md",
    },
+    {"path": "vllm/v1/metrics/perf.py", "output": "perf.inc.md"},
 ]

--- a/docs/mkdocs/javascript/reo.js
+++ b/docs/mkdocs/javascript/reo.js
-// Reo.Dev documentation tracking
-// https://docs.reo.dev/integrations/tracking-beacon/install-javascript-for-documentation
-!function(){var e,t,n;e="d5c4337961ef0ac",t=function(){Reo.init({clientID:"d5c4337961ef0ac"})},(n=document.createElement("script")).src="https://static.reo.dev/"+e+"/reo.js",n.defer=!0,n.onload=t,document.head.appendChild(n)}();
--- a/docs/models/extensions/instanttensor.md
+++ b/docs/models/extensions/instanttensor.md
+# Loading Model Weights with InstantTensor
+InstantTensor accelerates loading Safetensors weights on CUDA devices through distributed loading, pipelined prefetching, and direct I/O. InstantTensor also supports GDS (GPUDirect Storage) when available.
+For more details, see the [InstantTensor GitHub repository](https://github.com/scitix/InstantTensor).
+## Installation
+```bash
+pip install instanttensor
+```
+## Use InstantTensor in vLLM
+Add `--load-format instanttensor` as a command-line argument.
+For example:
+```bash
+vllm serve Qwen/Qwen2.5-0.5B --load-format instanttensor
+```
+## Benchmarks
+| Model | GPU | Backend | Load Time (s) | Throughput (GB/s) | Speedup |
+| --- | ---: | --- | ---: | ---: | --- |
+| Qwen3-30B-A3B | 1*H200 | Safetensors | 57.4 | 1.1 | 1x |
+| Qwen3-30B-A3B | 1*H200 | InstantTensor | 1.77 | 35 | <span style="color: green">**32.4x**</span> |
+| DeepSeek-R1 | 8*H200 | Safetensors | 160 | 4.3 | 1x |
+| DeepSeek-R1 | 8*H200 | InstantTensor | 15.3 | 45 | <span style="color: green">**10.5x**</span> |
+For the full benchmark results, see <https://github.com/scitix/InstantTensor/blob/main/docs/benchmark.md>.
--- a/docs/models/extensions/runai_model_streamer.md
+++ b/docs/models/extensions/runai_model_streamer.md
@@ -31,6 +31,16 @@ vllm serve gs://core-llm/Llama-3-8b \
    --load-format runai_streamer
 ```
+To run model from Azure Blob Storage run:
+```bash
+AZURE_STORAGE_ACCOUNT_NAME=<account> \
+vllm serve az://<container>/<model-path> \
+    --load-format runai_streamer
+```
+Authentication uses `DefaultAzureCredential`, which supports `az login`, managed identity, environment variables (`AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_CLIENT_SECRET`), and other methods.
 To run model from a S3 compatible object store run:
 ```bash

--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -59,7 +59,7 @@ for output in outputs:
    By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
    However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
-A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
+A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
 ### `LLM.beam_search`
@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
-A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py)
+A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py)
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:

--- a/docs/models/hardware_supported_models/cpu.md
+++ b/docs/models/hardware_supported_models/cpu.md
@@ -2,32 +2,32 @@
 ## Validated Hardware
-| Hardware                                 |
+| Hardware |
-| ----------------------------------------- |
+| -------- |
-| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html)                   |
+| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) |
-| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html)              |
+| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) |
 ## Recommended Models
 ### Text-only Language Models
 | Model                                | Architecture                             | Supported |
-|--------------------------------------|-------------------------------------------|-----------|
+| ------------------------------------ | ---------------------------------------- | --------- |
-| meta-llama/Llama-3.1-8B-Instruct     | LlamaForCausalLM                          | ✅        |
+| meta-llama/Llama-3.1-8B-Instruct     | LlamaForCausalLM                         | ✅        |
-| meta-llama/Llama-3.2-3B-Instruct     | LlamaForCausalLM                          | ✅        |
+| meta-llama/Llama-3.2-3B-Instruct     | LlamaForCausalLM                         | ✅        |
-| ibm-granite/granite-3.2-2b-instruct  | GraniteForCausalLM                        | ✅        |
+| ibm-granite/granite-3.2-2b-instruct  | GraniteForCausalLM                       | ✅        |
-| Qwen/Qwen3-1.7B                      | Qwen3ForCausalLM                          | ✅        |
+| Qwen/Qwen3-1.7B                      | Qwen3ForCausalLM                         | ✅        |
-| Qwen/Qwen3-4B                        | Qwen3ForCausalLM                          | ✅        |
+| Qwen/Qwen3-4B                        | Qwen3ForCausalLM                         | ✅        |
-| Qwen/Qwen3-8B                        | Qwen3ForCausalLM                          | ✅        |
+| Qwen/Qwen3-8B                        | Qwen3ForCausalLM                         | ✅        |
-| zai-org/glm-4-9b-hf                  | GLMForCausalLM                            | ✅        |
+| zai-org/glm-4-9b-hf                  | GLMForCausalLM                           | ✅        |
-| google/gemma-7b                      | GemmaForCausalLM                          | ✅        |
+| google/gemma-7b                      | GemmaForCausalLM                         | ✅        |
 ### Multimodal Language Models
 | Model                                | Architecture                             | Supported |
-|--------------------------------------|-------------------------------------------|-----------|
+| ------------------------------------ | ---------------------------------------- | --------- |
-| Qwen/Qwen2.5-VL-7B-Instruct          | Qwen2VLForConditionalGeneration           | ✅        |
+| Qwen/Qwen2.5-VL-7B-Instruct          | Qwen2VLForConditionalGeneration          | ✅        |
-| openai/whisper-large-v3              | WhisperForConditionalGeneration           | ✅        |
+| openai/whisper-large-v3              | WhisperForConditionalGeneration          | ✅        |
 ✅ Runs and optimized.  
 🟨 Runs and correct but not optimized to green yet.  

--- a/docs/models/hardware_supported_models/xpu.md
+++ b/docs/models/hardware_supported_models/xpu.md
@@ -2,9 +2,9 @@
 ## Validated Hardware
-| Hardware                                 |
+| Hardware |
-| ----------------------------------------- |
+| -------- |
-| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html)                   |
+| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) |
 ## Recommended Models
@@ -12,53 +12,53 @@
 | Model                                     | Architecture                                         | FP16 | Dynamic FP8 | MXFP4 |
 | ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- |
-| openai/gpt-oss-20b                        | GPTForCausalLM                                       |      |             | ✅     |
+| openai/gpt-oss-20b                        | GPTForCausalLM                                       |      |             | ✅    |
-| openai/gpt-oss-120b                       | GPTForCausalLM                                       |      |             | ✅     |
+| openai/gpt-oss-120b                       | GPTForCausalLM                                       |      |             | ✅    |
-| deepseek-ai/DeepSeek-R1-Distill-Llama-8B  | LlamaForCausalLM                                     | ✅    | ✅           |       |
+| deepseek-ai/DeepSeek-R1-Distill-Llama-8B  | LlamaForCausalLM                                     | ✅   | ✅          |       |
-| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B  | QwenForCausalLM                                      | ✅    | ✅           |       |
+| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B  | QwenForCausalLM                                      | ✅   | ✅          |       |
-| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B  | QwenForCausalLM                                      | ✅    | ✅           |       |
+| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B  | QwenForCausalLM                                      | ✅   | ✅          |       |
-| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM                                     | ✅    | ✅           |       |
+| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM                                     | ✅   | ✅          |       |
-| Qwen/Qwen2.5-72B-Instruct                 | Qwen2ForCausalLM                                     | ✅    | ✅           |       |
+| Qwen/Qwen2.5-72B-Instruct                 | Qwen2ForCausalLM                                     | ✅   | ✅          |       |
-| Qwen/Qwen3-14B                            | Qwen3ForCausalLM                                     | ✅    | ✅           |       |
+| Qwen/Qwen3-14B                            | Qwen3ForCausalLM                                     | ✅   | ✅          |       |
-| Qwen/Qwen3-32B                            | Qwen3ForCausalLM                                     | ✅    | ✅           |       |
+| Qwen/Qwen3-32B                            | Qwen3ForCausalLM                                     | ✅   | ✅          |       |
-| Qwen/Qwen3-30B-A3B                        | Qwen3ForCausalLM                                     | ✅    | ✅           |       |
+| Qwen/Qwen3-30B-A3B                        | Qwen3ForCausalLM                                     | ✅   | ✅          |       |
-| Qwen/Qwen3-30B-A3B-GPTQ-Int4              | Qwen3ForCausalLM                                     | ✅    | ✅           |       |
+| Qwen/Qwen3-30B-A3B-GPTQ-Int4              | Qwen3ForCausalLM                                     | ✅   | ✅          |       |
-| Qwen/Qwen3-coder-30B-A3B-Instruct         | Qwen3ForCausalLM                                     | ✅    | ✅           |       |
+| Qwen/Qwen3-coder-30B-A3B-Instruct         | Qwen3ForCausalLM                                     | ✅   | ✅          |       |
-| Qwen/QwQ-32B                              | QwenForCausalLM                                      | ✅    | ✅           |       |
+| Qwen/QwQ-32B                              | QwenForCausalLM                                      | ✅   | ✅          |       |
-| deepseek-ai/DeepSeek-V2-Lite              | DeepSeekForCausalLM                                  | ✅    | ✅           |       |
+| deepseek-ai/DeepSeek-V2-Lite              | DeepSeekForCausalLM                                  | ✅   | ✅          |       |
-| meta-llama/Llama-3.1-8B-Instruct          | LlamaForCausalLM                                     | ✅    | ✅           |       |
+| meta-llama/Llama-3.1-8B-Instruct          | LlamaForCausalLM                                     | ✅   | ✅          |       |
-| baichuan-inc/Baichuan2-13B-Chat           | BaichuanForCausalLM                                  | ✅    | ✅           |       |
+| baichuan-inc/Baichuan2-13B-Chat           | BaichuanForCausalLM                                  | ✅   | ✅          |       |
-| THUDM/GLM-4-9B-chat                       | GLMForCausalLM                                       | ✅    | ✅           |       |
+| THUDM/GLM-4-9B-chat                       | GLMForCausalLM                                       | ✅   | ✅          |       |
-| THUDM/CodeGeex4-All-9B                    | CodeGeexForCausalLM                                  | ✅    | ✅           |       |
+| THUDM/CodeGeex4-All-9B                    | CodeGeexForCausalLM                                  | ✅   | ✅          |       |
-| chuhac/TeleChat2-35B                      | LlamaForCausalLM (TeleChat2 based on Llama arch)     | ✅    | ✅           |       |
+| chuhac/TeleChat2-35B                      | LlamaForCausalLM (TeleChat2 based on Llama arch)     | ✅   | ✅          |       |
-| 01-ai/Yi1.5-34B-Chat                      | YiForCausalLM                                        | ✅    | ✅           |       |
+| 01-ai/Yi1.5-34B-Chat                      | YiForCausalLM                                        | ✅   | ✅          |       |
-| THUDM/CodeGeex4-All-9B                    | CodeGeexForCausalLM                                  | ✅    | ✅           |       |
+| THUDM/CodeGeex4-All-9B                    | CodeGeexForCausalLM                                  | ✅   | ✅          |       |
-| deepseek-ai/DeepSeek-Coder-33B-base       | DeepSeekCoderForCausalLM                             | ✅    | ✅           |       |
+| deepseek-ai/DeepSeek-Coder-33B-base       | DeepSeekCoderForCausalLM                             | ✅   | ✅          |       |
-| baichuan-inc/Baichuan2-13B-Chat           | BaichuanForCausalLM                                  | ✅    | ✅           |       |
+| baichuan-inc/Baichuan2-13B-Chat           | BaichuanForCausalLM                                  | ✅   | ✅          |       |
-| meta-llama/Llama-2-13b-chat-hf            | LlamaForCausalLM                                     | ✅    | ✅           |       |
+| meta-llama/Llama-2-13b-chat-hf            | LlamaForCausalLM                                     | ✅   | ✅          |       |
-| THUDM/CodeGeex4-All-9B                    | CodeGeexForCausalLM                                  | ✅    | ✅           |       |
+| THUDM/CodeGeex4-All-9B                    | CodeGeexForCausalLM                                  | ✅   | ✅          |       |
-| Qwen/Qwen1.5-14B-Chat                     | QwenForCausalLM                                      | ✅    | ✅           |       |
+| Qwen/Qwen1.5-14B-Chat                     | QwenForCausalLM                                      | ✅   | ✅          |       |
-| Qwen/Qwen1.5-32B-Chat                     | QwenForCausalLM                                      | ✅    | ✅           |       |
+| Qwen/Qwen1.5-32B-Chat                     | QwenForCausalLM                                      | ✅   | ✅          |       |
 ### Multimodal Language Models
 | Model                        | Architecture                     | FP16 | Dynamic FP8 | MXFP4 |
 | ---------------------------- | -------------------------------- | ---- | ----------- | ----- |
-| OpenGVLab/InternVL3_5-8B     | InternVLForConditionalGeneration | ✅    | ✅           |       |
+| OpenGVLab/InternVL3_5-8B     | InternVLForConditionalGeneration | ✅   | ✅          |       |
-| OpenGVLab/InternVL3_5-14B    | InternVLForConditionalGeneration | ✅    | ✅           |       |
+| OpenGVLab/InternVL3_5-14B    | InternVLForConditionalGeneration | ✅   | ✅          |       |
-| OpenGVLab/InternVL3_5-38B    | InternVLForConditionalGeneration | ✅    | ✅           |       |
+| OpenGVLab/InternVL3_5-38B    | InternVLForConditionalGeneration | ✅   | ✅          |       |
-| Qwen/Qwen2-VL-7B-Instruct    | Qwen2VLForConditionalGeneration  | ✅    | ✅           |       |
+| Qwen/Qwen2-VL-7B-Instruct    | Qwen2VLForConditionalGeneration  | ✅   | ✅          |       |
-| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration  | ✅    | ✅           |       |
+| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration  | ✅   | ✅          |       |
-| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration  | ✅    | ✅           |       |
+| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration  | ✅   | ✅          |       |
-| THUDM/GLM-4v-9B              | GLM4vForConditionalGeneration    | ✅    | ✅           |       |
+| THUDM/GLM-4v-9B              | GLM4vForConditionalGeneration    | ✅   | ✅          |       |
-| openbmb/MiniCPM-V-4          | MiniCPMVForConditionalGeneration | ✅    | ✅           |       |
+| openbmb/MiniCPM-V-4          | MiniCPMVForConditionalGeneration | ✅   | ✅          |       |
 ### Embedding and Reranker Language Models
 | Model                   | Architecture                   | FP16 | Dynamic FP8 | MXFP4 |
 | ----------------------- | ------------------------------ | ---- | ----------- | ----- |
-| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding          | ✅    | ✅           |       |
+| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding          | ✅   | ✅          |       |
-| Qwen/Qwen3-Reranker-8B  | Qwen3ForSequenceClassification | ✅    | ✅           |       |
+| Qwen/Qwen3-Reranker-8B  | Qwen3ForSequenceClassification | ✅   | ✅          |       |
 ✅ Runs and optimized.  
 🟨 Runs and correct but not optimized to green yet.  

--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -31,7 +31,7 @@ vLLM will attempt to automatically convert the model according to the architectu
 shown in the table below.
 | Architecture                                    | `--convert` | Supported pooling tasks               |
-|-------------------------------------------------|-------------|---------------------------------------|
+| ----------------------------------------------- | ----------- | ------------------------------------- |
 | `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed`     | `token_embed`, `embed`                |
 | `*ForRewardModeling`, `*RewardModel`            | `embed`     | `token_embed`, `embed`                |
 | `*For*Classification`, `*ClassificationModel`   | `classify`  | `token_classify`, `classify`, `score` |
@@ -46,7 +46,7 @@ Each pooling model in vLLM supports one or more of these tasks according to
 enabling the corresponding APIs:
 | Task             | APIs                                                                          |
-|------------------|-------------------------------------------------------------------------------|
+| ---------------- | ----------------------------------------------------------------------------- |
 | `embed`          | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` |
 | `classify`       | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`               |
 | `score`          | `LLM.score(...)`                                                              |
@@ -69,7 +69,7 @@ If the model has been converted via `--convert` (see above),
 the pooler assigned to each task has the following attributes by default:
 | Task       | Pooling Type | Normalization | Softmax |
-|------------|--------------|---------------|---------|
+| ---------- | ------------ | ------------- | ------- |
 | `embed`    | `LAST`       | ✅︎            | ❌      |
 | `classify` | `LAST`       | ❌            | ✅︎      |
@@ -99,7 +99,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
-A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py)
+A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)
 ### `LLM.classify`
@@ -116,7 +116,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
-A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py)
+A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)
 ### `LLM.score`
@@ -140,7 +140,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
-A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py)
+A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)
 ### `LLM.reward`
@@ -156,7 +156,7 @@ data = output.outputs.data
 print(f"Data: {data!r}")
 ```
-A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py)
+A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)
 ### `LLM.encode`
@@ -311,20 +311,31 @@ An OpenAI client example can be found here: [examples/pooling/embed/openai_embed
 [ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders.
-vLLM supports ColBERT models for reranking tasks, automatically applying MaxSim scoring for query-document relevance:
+vLLM supports ColBERT models with multiple encoder backbones:
+| Architecture | Backbone | Example HF Models |
+| - | - | - |
+| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
+| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
+| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
+**BERT-based ColBERT** models work out of the box:
 ```shell
 vllm serve answerdotai/answerai-colbert-small-v1
 ```
-Currently supports ColBERT models with standard BERT encoders (e.g., `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0`).
+For **non-BERT backbones**, use `--hf-overrides` to set the correct architecture:
-ColBERT models with modified encoder architectures are not yet supported, including BERT variants with rotary embeddings (e.g., `jinaai/jina-colbert-v2`) or other custom encoders (e.g., `LiquidAI/LFM2-ColBERT-350M`).
-If your standard BERT ColBERT model's config doesn't specify the architecture as `HF_ColBERT`, override it with:
 ```shell
-vllm serve your-colbert-model --hf-overrides '{"architectures": ["HF_ColBERT"]}'
+# ModernBERT backbone
+vllm serve lightonai/GTE-ModernColBERT-v1 \
+    --hf-overrides '{"architectures": ["ColBERTModernBertModel"]}'
+# Jina XLM-RoBERTa backbone
+vllm serve jinaai/jina-colbert-v2 \
+    --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
+    --trust-remote-code
 ```
 Then you can use the rerank endpoint:
@@ -363,6 +374,257 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
 An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)
+### ColQwen3 Multi-Modal Late Interaction Models
+ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
+| Architecture | Backbone | Example HF Models |
+| - | - | - |
+| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
+| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
+| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` |
+Start the server:
+```shell
+vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
+```
+#### Text-only scoring and reranking
+Use the `/rerank` endpoint:
+```shell
+curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "query": "What is machine learning?",
+    "documents": [
+        "Machine learning is a subset of artificial intelligence.",
+        "Python is a programming language.",
+        "Deep learning uses neural networks."
+    ]
+}'
+```
+Or the `/score` endpoint:
+```shell
+curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "text_1": "What is the capital of France?",
+    "text_2": ["The capital of France is Paris.", "Python is a programming language."]
+}'
+```
+#### Multi-modal scoring and reranking (text query × image documents)
+The `/score` and `/rerank` endpoints also accept multi-modal inputs directly.
+Pass image documents using the `data_1`/`data_2` (for `/score`) or `documents` (for `/rerank`) fields
+with a `content` list containing `image_url` and `text` parts — the same format used by the
+OpenAI chat completion API:
+Score a text query against image documents:
+```shell
+curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "data_1": "Retrieve the city of Beijing",
+    "data_2": [
+        {
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
+                {"type": "text", "text": "Describe the image."}
+            ]
+        }
+    ]
+}'
+```
+Rerank image documents by a text query:
+```shell
+curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "query": "Retrieve the city of Beijing",
+    "documents": [
+        {
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
+                {"type": "text", "text": "Describe the image."}
+            ]
+        },
+        {
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
+                {"type": "text", "text": "Describe the image."}
+            ]
+        }
+    ],
+    "top_n": 2
+}'
+```
+#### Raw token embeddings
+You can also get the raw token embeddings using the `/pooling` endpoint with `token_embed` task:
+```shell
+curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "input": "What is machine learning?",
+    "task": "token_embed"
+}'
+```
+For **image inputs** via the pooling endpoint, use the chat-style `messages` field:
+```shell
+curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
+                {"type": "text", "text": "Describe the image."}
+            ]
+        }
+    ]
+}'
+```
+#### Examples
+- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
+- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
+### Llama Nemotron Multimodal
+#### Embedding Model
+Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
+(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
+single-vector embeddings from text and/or images.
+| Architecture | Backbone | Example HF Models |
+| - | - | - |
+| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` |
+Start the server:
+```shell
+vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \
+    --trust-remote-code \
+    --chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja
+```
+!!! note
+    The chat template bundled with this model's tokenizer is not suitable for
+    the embeddings API. Use the provided override template above when serving
+    with the `messages`-based (chat-style) embeddings endpoint.
+    The override template uses the message `role` to automatically prepend the
+    appropriate prefix: set `role` to `"query"` for queries (prepends `query: `)
+    or `"document"` for passages (prepends `passage: `). Any other role omits
+    the prefix.
+Embed text queries:
+```shell
+curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
+    "model": "nvidia/llama-nemotron-embed-vl-1b-v2",
+    "messages": [
+        {
+            "role": "query",
+            "content": [
+                {"type": "text", "text": "What is machine learning?"}
+            ]
+        }
+    ]
+}'
+```
+Embed images via the chat-style `messages` field:
+```shell
+curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
+    "model": "nvidia/llama-nemotron-embed-vl-1b-v2",
+    "messages": [
+        {
+            "role": "document",
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
+                {"type": "text", "text": "Describe the image."}
+            ]
+        }
+    ]
+}'
+```
+#### Reranker Model
+Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP
+backbone with a sequence-classification head for cross-encoder scoring and reranking.
+| Architecture | Backbone | Example HF Models |
+| - | - | - |
+| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` |
+Start the server:
+```shell
+vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \
+    --runner pooling \
+    --trust-remote-code \
+    --chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja
+```
+!!! note
+    The chat template bundled with this checkpoint's tokenizer is not suitable
+    for the Score/Rerank APIs. Use the provided override template when serving:
+    `examples/pooling/score/template/nemotron-vl-rerank.jinja`.
+Score a text query against an image document:
+```shell
+curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
+    "model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
+    "data_1": "Find diagrams about autonomous robots",
+    "data_2": [
+        {
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
+                {"type": "text", "text": "Robotics workflow diagram."}
+            ]
+        }
+    ]
+}'
+```
+Rerank image documents by a text query:
+```shell
+curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
+    "model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
+    "query": "Find diagrams about autonomous robots",
+    "documents": [
+        {
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
+                {"type": "text", "text": "Robotics workflow diagram."}
+            ]
+        },
+        {
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
+                {"type": "text", "text": "General skyline photo."}
+            ]
+        }
+    ],
+    "top_n": 2
+}'
+```
 ### BAAI/bge-m3
 The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
@@ -379,7 +641,7 @@ Then you obtain the sparse embeddings like this:
 curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
     "model": "BAAI/bge-m3",
     "task": "token_classify",
-     "input": ["What is BGE M3?", "Defination of BM25"]
+     "input": ["What is BGE M3?", "Definition of BM25"]
 }'
 ```
@@ -395,7 +657,7 @@ You can obtain the colbert embeddings like this:
 curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
     "model": "BAAI/bge-m3",
     "task": "token_embed",
-     "input": ["What is BGE M3?", "Defination of BM25"]
+     "input": ["What is BGE M3?", "Definition of BM25"]
 }'
 ```

--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -179,8 +179,9 @@ class MyConfig(PretrainedConfig):
 Some model architectures are supported via vLLM plugins. These plugins extend vLLM's capabilities through the [plugin system](../design/plugin_system.md).
 | Architecture | Models | Plugin Repository |
-|--------------|--------|-------------------|
+| ------------ | ------ | ----------------- |
 | `BartForConditionalGeneration` | BART | [bart-plugin](https://github.com/vllm-project/bart-plugin) |
+| `Florence2ForConditionalGeneration` | Florence-2 | [bart-plugin](https://github.com/vllm-project/bart-plugin) |
 For other model architectures not natively supported, in particular for Encoder-Decoder models, we recommend following a similar pattern by implementing support through the plugin system.
@@ -362,15 +363,17 @@ th {
 </style>
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ |
 | `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
 | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
 | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
 | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ |
+| `AXK1ForCausalLM` | A.X-K1 | `skt/A.X-K1`, etc. | | ✅︎ |
 | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
 | `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ |
 | `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ |
+| `BailingMoeV2_5ForCausalLM` | Ling | `inclusionAI/Ling-2.5-1T`, `inclusionAI/Ring-2.5-1T` | | ✅︎ |
 | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
 | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `thu-coai/ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
@@ -384,7 +387,7 @@ th {
 | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ |
 | `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | ✅︎ | ✅︎ |
 | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ |
-| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ |
+| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. | ✅︎ | ✅︎ |
 | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `ExaoneMoEForCausalLM` | K-EXAONE | `LGAI-EXAONE/K-EXAONE-236B-A23B`, etc. | | |
 | `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ |
@@ -415,6 +418,7 @@ th {
 | `Grok1ForCausalLM` | Grok2 | `xai-org/grok-2` | ✅︎ | ✅︎ |
 | `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
 | `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
+| `HyperCLOVAXForCausalLM` | HyperCLOVAX-SEED-Think-14B | `naver-hyperclovax/HyperCLOVAX-SEED-Think-14B` | ✅︎ | ✅︎ |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
 | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
@@ -424,18 +428,18 @@ th {
 | `Jais2ForCausalLM` | Jais2 | `inceptionai/Jais-2-8B-Chat`, `inceptionai/Jais-2-70B-Chat`, etc. | | ✅︎ |
 | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ |
 | `KimiLinearForCausalLM` | Kimi-Linear-48B-A3B-Base, Kimi-Linear-48B-A3B-Instruct | `moonshotai/Kimi-Linear-48B-A3B-Base`, `moonshotai/Kimi-Linear-48B-A3B-Instruct` | | ✅︎ |
-| `Lfm2ForCausalLM`  | LFM2  | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ |
+| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ |
-| `Lfm2MoeForCausalLM`  | LFM2MoE  | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ |
+| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ |
 | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
 | `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
 | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
 | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
 | `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
-| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | ︎| ✅︎ |
+| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ |
 | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
 | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
 | `MiniMaxForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01-hf`, etc. | | |
-| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | ✅︎ | ✅︎ |
+| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 | `MiniMaxAI/MiniMax-M2`, etc. | ✅︎ | ✅︎ |
 | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
 | `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
 | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
@@ -445,13 +449,14 @@ th {
 | `OlmoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ |
 | `Olmo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ |
 | `Olmo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ |
+| `OlmoHybridForCausalLM` | OLMo Hybrid | `allenai/Olmo-Hybrid-7B` | ✅︎ | ✅︎ |
 | `OlmoeForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
 | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ |
 | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
 | `OuroForCausalLM` | ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc. | ✅︎ | |
-| `PanguEmbeddedForCausalLM` |openPangu-Embedded-7B | `FreedomIntelligence/openPangu-Embedded-7B-V1.1` | ✅︎ | ✅︎ |
+| `PanguEmbeddedForCausalLM` | openPangu-Embedded-7B | `FreedomIntelligence/openPangu-Embedded-7B-V1.1` | ✅︎ | ✅︎ |
-| `PanguProMoEV2ForCausalLM` |openpangu-pro-moe-v2 | | ✅︎ | ✅︎ |
+| `PanguProMoEV2ForCausalLM` | openpangu-pro-moe-v2 | | ✅︎ | ✅︎ |
-| `PanguUltraMoEForCausalLM` |openpangu-ultra-moe-718b-model | `FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1` | ✅︎ | ✅︎ |
+| `PanguUltraMoEForCausalLM` | openpangu-ultra-moe-718b-model | `FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1` | ✅︎ | ✅︎ |
 | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
 | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
 | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
@@ -465,13 +470,15 @@ th {
 | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
 | `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `RWForCausalLM` | Falcon RW | `tiiuae/falcon-40b`, etc. | | ✅︎ |
+| `SarvamMoEForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-30b-a3b`, etc. | ✅︎ | ✅︎ |
+| `SarvamMLAForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-105b-a9b`, etc. | | ✅︎ |
 | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
 | `StableLMEpochForCausalLM` | StableLM Epoch | `stabilityai/stablelm-zephyr-3b`, etc. | | ✅︎ |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
 | `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ |
-| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/Step-3.5-Flash`, etc. |  | ✅︎ |
+| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/Step-3.5-Flash`, etc. | | ✅︎ |
 | `TeleChatForCausalLM` | TeleChat | `chuhac/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
 | `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
 | `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
@@ -486,7 +493,7 @@ th {
 Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ |
 !!! note
@@ -505,16 +512,17 @@ See [this page](./pooling_models.md) for more information on how to use pooling
 These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
 | `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
+| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | |
 | `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
 | `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
 | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
-| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. |  |  |
+| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | |
-| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. |  |  |
+| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
-| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. |  |  |
+| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
-| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. |  |  |
+| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
 | `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
 | `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
 | `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
@@ -549,9 +557,10 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API.
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
-| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
+| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | |
 | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
+| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
 | `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
 <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))  
@@ -566,8 +575,9 @@ Cross-encoder and reranker models are a subset of classification models that acc
 These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
 | Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|---------------------------|-----------------------------|-----------------------------------------|
+| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- |
 | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | |
+| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | N/A | | |
 | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
 | `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
 | `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
@@ -616,7 +626,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
 These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API.
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
 | `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
 | `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
@@ -631,9 +641,10 @@ These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward)
 These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|-----------------------------|-----------------------------------------|
+| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
-| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. |  |  |
+| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
-| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` |  |  |
+| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | |
+| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
 !!! note
    Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner_offline.py](../../examples/pooling/token_classify/ner_offline.py), [examples/pooling/token_classify/ner_online.py](../../examples/pooling/token_classify/ner_online.py).
@@ -658,7 +669,7 @@ On the other hand, modalities separated by `/` are mutually exclusive.
 See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model.
 !!! tip
-    For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
+    For hybrid-only models such as Llama-4, Step3, Mistral-3 and Qwen-3.5, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (`--language-model-only`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
 !!! note
    vLLM currently supports adding LoRA adapters to the language backbone for most multimodal models. Additionally, vLLM now experimentally supports adding LoRA to the tower and connector modules for some multimodal models. See [this page](../features/lora.md).
@@ -672,7 +683,7 @@ See [this page](generative_models.md) for more information on how to use generat
 These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API.
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
 | `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
@@ -681,7 +692,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
 | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
-| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
+| `DeepseekVLV2ForCausalLM` | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
 | `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ |
 | `DeepseekOCR2ForCausalLM` | DeepSeek-OCR-2 | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR-2`, etc. | ✅︎ | ✅︎ |
 | `Eagle2_5_VLForConditionalGeneration` | Eagle2.5-VL | T + I<sup>E+</sup> | `nvidia/Eagle2.5-8B`, etc. | ✅︎ | ✅︎ |
@@ -692,8 +703,10 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
 | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
 | `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
+| `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
 | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
+| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
 | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
@@ -705,9 +718,10 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `KananaVForConditionalGeneration` | Kanana-V | T + I<sup>+</sup> | `kakaocorp/kanana-1.5-v-3b-instruct`, etc. | | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
 | `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
-| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
+| `KimiAudioForConditionalGeneration` | Kimi-Audio | T + A<sup>+</sup> | `moonshotai/Kimi-Audio-7B-Instruct` | | ✅︎ |
 | `KimiK25ForConditionalGeneration` | Kimi-K2.5 | T + I<sup>+</sup> | `moonshotai/Kimi-K2.5` | | ✅︎ |
-| `LightOnOCRForConditionalGeneration`  | LightOnOCR-1B  | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
+| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
+| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
 | `Lfm2VlForConditionalGeneration` | LFM2-VL | T + I<sup>+</sup> | `LiquidAI/LFM2-VL-450M`, `LiquidAI/LFM2-VL-3B`, `LiquidAI/LFM2-VL-8B-A1B`, etc. | ✅︎ | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
 | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
@@ -724,9 +738,11 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B` | ✅︎ | ✅︎ |
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
 | `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
-| `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> |`FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |
+| `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |
 | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
 | `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
+| `Ovis2_6ForCausalLM` | Ovis2.6 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.6-2B`, etc. | | |
+| `Ovis2_6_MoeForCausalLM` | Ovis2.6 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.6-30B-A3B`, etc. | | |
 | `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
@@ -737,6 +753,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ |
+| `Qwen3_5ForConditionalGeneration` | Qwen3.5 | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3.5-9B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Qwen3_5MoeForConditionalGeneration` | Qwen3.5-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3.5-35B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ |
@@ -753,13 +771,11 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
+| ------------ | ------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
 | `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ |
-<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
+<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.</br>
-&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
+<sup>E</sup> Pre-computed embeddings can be inputted for this modality.</br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
-<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 !!! note
@@ -786,7 +802,9 @@ Some models are supported only via the [Transformers modeling backend](#transfor
 Speech2Text models trained specifically for Automatic Speech Recognition.
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
+| `FireRedASR2ForConditionalGeneration` | FireRedASR2 | `allendou/FireRedASR2-LLM-vllm`, etc. | | |
+| `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | |
 | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
 | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
@@ -812,8 +830,11 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
 The following table lists those that are tested in vLLM.
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
+| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
+| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | |
+| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
 | `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
 | `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
 | `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
@@ -831,8 +852,9 @@ Cross-encoder and reranker models are a subset of classification models that acc
 These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|--------|-------------------|----------------------|---------------------------|
+| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
+| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + I<sup>E+</sup> | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | |
 | `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |
 <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))  

--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -8,7 +8,7 @@ EP is typically coupled with Data Parallelism (DP). While DP can be used indepen
 Before using EP, you need to install the necessary dependencies. We are actively working on making this easier in the future:
-1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
+1. **Install DeepEP**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
 2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
 3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](../../tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
@@ -17,12 +17,12 @@ Before using EP, you need to install the necessary dependencies. We are actively
 vLLM provides multiple communication backends for EP. Use `--all2all-backend` to select one:
 | Backend | Use Case | Features | Best For |
-|---------|----------|----------|----------|
+| ------- | -------- | -------- | -------- |
 | `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration |
-| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development |
 | `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios |
 | `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
-| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes |
+| `flashinfer_nvlink_one_sided` | MNNVL systems | FlashInfer's one-sided A2A strategy for multi-node NVLink | High-throughput workloads |
+| `flashinfer_nvlink_two_sided` | MNNVL systems | FlashInfer's two-sided A2A strategy for multi-node NVLink | Systems with NVLink across nodes |
 | `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
 ## Single Node Deployment
@@ -49,7 +49,7 @@ Where:
 When EP is enabled, different layers in MoE models behave differently:
 | Layer Type | Behavior | Parallelism Used |
-|------------|----------|------------------|
+| ---------- | -------- | ---------------- |
 | **Expert (MoE) Layers** | Sharded across all EP ranks | Expert Parallel (EP) of size `TP × DP` |
 | **Attention Layers** | Behavior depends on TP size | See below |
@@ -71,12 +71,11 @@ For example, with `TP=2, DP=4` (8 GPUs total):
 The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
 ```bash
-# Single node EP deployment with pplx backend
+# Single node EP deployment
 vllm serve deepseek-ai/DeepSeek-V3-0324 \
    --tensor-parallel-size 1 \       # Tensor parallelism across 1 GPU
    --data-parallel-size 8 \         # Data parallelism across 8 processes
-    --enable-expert-parallel \       # Enable expert parallelism
+    --enable-expert-parallel         # Enable expert parallelism
-    --all2all-backend pplx           # Use pplx communication backend
 ```
 ## Multi-Node Deployment
@@ -148,9 +147,9 @@ When enabled, vLLM collects load statistics with every forward pass and periodic
 Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. The available keys and their descriptions are:
 | Parameter | Description | Default |
-|-----------|-------------|---------|
+| --------- | ----------- | ------- |
-| `window_size`| Number of engine steps to track for rebalancing decisions | 1000 |
+| `window_size` | Number of engine steps to track for rebalancing decisions | 1000 |
-| `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
+| `step_interval` | Frequency of rebalancing (every N engine steps) | 3000 |
 | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
 | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
 | `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
@@ -197,7 +196,6 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
    --tensor-parallel-size 1 \       # Tensor parallelism
    --data-parallel-size 8 \         # Data parallelism
    --enable-expert-parallel \       # Enable EP
-    --all2all-backend pplx \         # Use pplx communication backend
    --enable-eplb \                  # Enable load balancer
    --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
 ```

--- a/docs/serving/integrations/claude_code.md
+++ b/docs/serving/integrations/claude_code.md
@@ -60,6 +60,9 @@ The environment variables:
 !!! tip
    You can add these environment variables to your shell profile (e.g., `.bashrc`, `.zshrc`), Claude Code configuration file (`~/.claude/settings.json`), or create a wrapper script for convenience.
+!!! warning
+    Claude Code recently started injecting a per-request hash in the system prompt, which can defeat [prefix caching](../../design/prefix_caching.md) because the prompt changes on every request, causing greatly reduced performance. This is addressed automatically in vLLM versions > 0.17.1 but for older versions `"CLAUDE_CODE_ATTRIBUTION_HEADER": "0"` should be added to the `"env"` section of `~/.claude/settings.json` (see this [blog post](https://unsloth.ai/docs/basics/claude-code#fixing-90-slower-inference-in-claude-code) from Unsloth).
 ## Testing the Setup
 Once Claude Code launches, try a simple prompt to verify the connection:

--- a/docs/serving/integrations/llamaindex.md
+++ b/docs/serving/integrations/llamaindex.md
@@ -17,7 +17,7 @@ llm = Vllm(
    model="microsoft/Orca-2-7b",
    tensor_parallel_size=4,
    max_new_tokens=100,
-    vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
+    vllm_kwargs={"gpu_memory_utilization": 0.5},
 )
 ```

--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -72,6 +72,9 @@ In addition, we have the following custom APIs:
    - Only applicable to [classification models](../models/pooling_models.md).
 - [Score API](#score-api) (`/score`)
    - Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
+- [Cohere Embed API](#cohere-embed-api) (`/v2/embed`)
+    - Compatible with [Cohere's Embed API](https://docs.cohere.com/reference/embed)
+    - Works with any [embedding model](../models/pooling_models.md), including multimodal models.
 - [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
    - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
    - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
@@ -84,7 +87,7 @@ In order for the language model to support chat protocol, vLLM requires the mode
 a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
 specifies how roles, messages, and other chat-specific tokens are encoded in the input.
-An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models)
+An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/#prompt-template-for-meta-llama-3)
 Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
 you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat
@@ -190,7 +193,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
-Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py)
+Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py)
 #### Extra parameters
@@ -221,7 +224,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf
 - *Note: `image_url.detail` parameter is not supported.*
-Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py)
+Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py)
 #### Extra parameters
@@ -429,6 +432,137 @@ these extra parameters are supported instead:
    --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
    ```
+### Cohere Embed API
+Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed) which adds support for some modern embedding feature such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models).
+#### Cohere Embed API request parameters
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- |
+| `model` | string | Yes | Model name |
+| `input_type` | string | No | Prompt prefix key (model-dependent, see below) |
+| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) |
+| `images` | list[string] | No | Base64 data URI images |
+| `inputs` | list[object] | No | Mixed text and image content objects |
+| `embedding_types` | list[string] | No | Output types (default: `["float"]`) |
+| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) |
+| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) |
+#### Text embedding
+```bash
+curl -X POST "http://localhost:8000/v2/embed" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Snowflake/snowflake-arctic-embed-m-v1.5",
+    "input_type": "query",
+    "texts": ["Hello world", "How are you?"],
+    "embedding_types": ["float"]
+  }'
+```
+??? console "Response"
+    ```json
+    {
+      "id": "embd-...",
+      "embeddings": {
+        "float": [
+          [0.012, -0.034, ...],
+          [0.056, 0.078, ...]
+        ]
+      },
+      "texts": ["Hello world", "How are you?"],
+      "meta": {
+        "api_version": {"version": "2"},
+        "billed_units": {"input_tokens": 12}
+      }
+    }
+    ```
+#### Mixed text and image inputs
+For multimodal models, you can embed images by passing base64 data URIs. The `inputs` field accepts a list of objects with mixed text and image content:
+```bash
+curl -X POST "http://localhost:8000/v2/embed" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "google/siglip-so400m-patch14-384",
+    "inputs": [
+      {
+        "content": [
+          {"type": "text", "text": "A photo of a cat"},
+          {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}
+        ]
+      }
+    ],
+    "embedding_types": ["float"]
+  }'
+```
+#### Embedding types
+The `embedding_types` parameter controls the output format. Multiple types can be requested in a single call:
+| Type | Description |
+| ---- | ----------- |
+| `float` | Raw float32 embeddings (default) |
+| `binary` | Bit-packed signed binary |
+| `ubinary` | Bit-packed unsigned binary |
+| `base64` | Little-endian float32 encoded as base64 |
+```bash
+curl -X POST "http://localhost:8000/v2/embed" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Snowflake/snowflake-arctic-embed-m-v1.5",
+    "input_type": "query",
+    "texts": ["What is machine learning?"],
+    "embedding_types": ["float", "binary"]
+  }'
+```
+??? console "Response"
+    ```json
+    {
+      "id": "embd-...",
+      "embeddings": {
+        "float": [[0.012, -0.034, ...]],
+        "binary": [[42, -117, ...]]
+      },
+      "texts": ["What is machine learning?"],
+      "meta": {
+        "api_version": {"version": "2"},
+        "billed_units": {"input_tokens": 8}
+      }
+    }
+    ```
+#### Truncation
+The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled:
+| Value | Behavior |
+| ----- | --------- |
+| `END` (default) | Keep the first tokens, drop the end |
+| `START` | Keep the last tokens, drop the beginning |
+| `NONE` | Return an error if the input is too long |
+#### Input type and prompt prefixes
+The `input_type` field selects a prompt prefix to prepend to each text input. The available values
+depend on the model:
+- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are
+  the valid `input_type` values and the corresponding value is prepended to each text.
+- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are
+  the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`,
+  so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`.
+- **Other models**: `input_type` is not accepted and will raise a validation error if passed.
 ### Transcriptions API
 Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
@@ -439,6 +573,8 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 Code example: [examples/online_serving/openai_transcription_client.py](../../examples/online_serving/openai_transcription_client.py)
+NOTE: beam search is currently supported in the transcriptions endpoint for encoder-decoder multimodal models, e.g., whisper, but highly inefficient as work for handling the encoder/decoder cache is actively ongoing. This is an active point of ongoing optimization and will be handled properly in the very near future.
 #### API Enforced Limits
 Set the maximum audio file size (in MB) that VLLM will accept, via the
@@ -596,7 +732,7 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
 #### Client → Server Events
 | Event | Description |
-|-------|-------------|
+| ----- | ----------- |
 | `input_audio_buffer.append` | Send base64-encoded audio chunk: `{"type": "input_audio_buffer.append", "audio": "<base64>"}` |
 | `input_audio_buffer.commit` | Trigger transcription processing or end: `{"type": "input_audio_buffer.commit", "final": bool}` |
 | `session.update` | Configure session: `{"type": "session.update", "model": "model-name"}` |
@@ -604,7 +740,7 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
 #### Server → Client Events
 | Event | Description |
-|-------|-------------|
+| ----- | ----------- |
 | `session.created` | Connection established with session ID and timestamp |
 | `transcription.delta` | Incremental transcription text: `{"type": "transcription.delta", "delta": "text"}` |
 | `transcription.done` | Final transcription with usage stats |

--- a/docs/serving/parallelism_scaling.md
+++ b/docs/serving/parallelism_scaling.md
@@ -68,6 +68,12 @@ vLLM uses Ray to manage the distributed execution of tasks across multiple nodes
 Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads.
+Ray is an optional dependency. Install it explicitly before using Ray-based execution, for example:
+```bash
+pip install "ray[cgraph]"
+```
 For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.html).
 ### Ray cluster setup with containers

--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -45,6 +45,12 @@ The following metrics are exposed:
 --8<-- "docs/generated/metrics/nixl_connector.inc.md"
+## Model Flops Utilization (MFU) Performance Metrics
+These metrics are available via `--enable-mfu-metrics`:
+--8<-- "docs/generated/metrics/perf.inc.md"
 ## Deprecation Policy
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`

--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -41,20 +41,20 @@ Key points from the PyTorch security guide:
 - Messages are sent unencrypted
 - Connections are accepted from anywhere without checks
-### Security Recommendations
+## Security Recommendations
-#### 1. **Network Isolation:**
+### 1. **Network Isolation:**
 - Deploy vLLM nodes on a dedicated, isolated network
 - Use network segmentation to prevent unauthorized access
 - Implement appropriate firewall rules
-#### 2. **Configuration Best Practices:**
+### 2. **Configuration Best Practices:**
 - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
 - Configure firewalls to only allow necessary ports between nodes
-#### 3. **Access Control:**
+### 3. **Access Control:**
 - Restrict physical and network access to the deployment environment
 - Implement proper authentication and authorization for management interfaces
@@ -66,6 +66,18 @@ Restrict domains that vLLM can access for media URLs by setting
 `--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
 (e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)
+Without domain restrictions, a malicious user could supply URLs that:
+- **Target internal services**: Access internal network endpoints, cloud metadata
+  services (e.g. `169.254.169.254`), or other services not intended to be
+  publicly reachable (SSRF).
+- **Consume excessive resources**: Point to extremely large files or slow
+  endpoints, causing the server to download unbounded amounts of data and
+  exhausting memory, disk, or network bandwidth.
+By explicitly allowlisting only the domains you expect media to come from, you
+significantly reduce the attack surface for these types of abuse.
 Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP
 redirects from being followed to bypass domain restrictions.
@@ -219,6 +231,47 @@ The most effective approach is to deploy vLLM behind a reverse proxy (such as ng
 - Blocks all other endpoints, including the unauthenticated inference and operational control endpoints
 - Implements additional authentication, rate limiting, and logging at the proxy layer
+## Tool Server and MCP Security
+vLLM supports connecting to external tool servers via the `--tool-server` argument. This enables models to call tools through the Responses API (`/v1/responses`). Tool server support works with all models — it is not limited to specific model architectures.
+**Important:** No tool servers are enabled by default. They must be explicitly opted into via configuration.
+### Built-in Demo Tools (GPT-OSS)
+Passing `--tool-server demo` enables built-in demo tools that work with any model that supports tool calling. The tool implementations are not part of vLLM — they are provided by the separately installed [`gpt-oss`](https://github.com/openai/gpt-oss) package. vLLM provides thin wrappers that delegate to `gpt-oss`.
+- **Code interpreter** (`python`): Python execution via Docker (via `gpt_oss.tools.python_docker`)
+- **Web browser** (`browser`): Search via Exa API, requires `EXA_API_KEY` (via `gpt_oss.tools.simple_browser`)
+#### Code Interpreter (Python Tool) Security Risks
+The code interpreter executes model-generated code inside a Docker container. However, the container is **not configured with network isolation by default**. It inherits the host's Docker networking configuration (e.g., default bridge network or `--network=host`), which means:
+- The container may be able to access the host network and LAN.
+- Internal services reachable from the container may be exploited via SSRF (Server-Side Request Forgery).
+- Cloud metadata services (e.g., `169.254.169.254`) may be accessible.
+- If vulnerable internal services (such as `torch.distributed` endpoints) are reachable from the container, this could be used to attack them.
+This is particularly concerning because the code being executed is generated by the model, which may be influenced by adversarial inputs (prompt injection).
+#### Controlling Built-in Tool Availability
+Built-in demo tools are controlled by two settings:
+1. **`--tool-server demo`**: Enables the built-in demo tools (browser and Python code interpreter).
+2. **`VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS`**: When built-in tools are requested via the `mcp` tool type in the Responses API, this comma-separated allowlist controls which tool labels are permitted. Valid values are:
+   - `container` - Container tool
+   - `code_interpreter` - Python code execution tool
+   - `web_search_preview` - Web search/browser tool
+   If this variable is not set or is empty, no built-in tools requested via MCP tool type will be enabled.
+To disable the Python code interpreter specifically, omit `code_interpreter` from `VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS`.
+**Consider a custom implementation**: The GPT-OSS Python tool is a reference implementation. For production deployments, consider implementing a custom code execution sandbox with stricter isolation guarantees. See the [GPT-OSS documentation](https://github.com/openai/gpt-oss?tab=readme-ov-file#python) for guidance.
 ## Reporting Security Vulnerabilities
 If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -91,11 +91,11 @@ If GPU/CPU communication cannot be established, you can use the following Python
    import torch
    import torch.distributed as dist
    dist.init_process_group(backend="nccl")
-    local_rank = dist.get_rank() % torch.cuda.device_count()
+    local_rank = dist.get_rank() % torch.accelerator.device_count()
-    torch.cuda.set_device(local_rank)
+    torch.accelerator.set_device_index(local_rank)
    data = torch.FloatTensor([1,] * 128).to("cuda")
    dist.all_reduce(data, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    value = data.mean().item()
    world_size = dist.get_world_size()
    assert value == world_size, f"Expected {world_size}, got {value}"
@@ -155,26 +155,24 @@ If you are testing with a single node, adjust `--nproc-per-node` to the number o
 NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
 ```
-If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
+If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address and port of the master node (e.g., `10.0.0.1:29400`), reachable from all nodes. Then, run:
 ```bash
 NCCL_DEBUG=TRACE torchrun --nnodes 2 \
    --nproc-per-node=2 \
-    --rdzv_backend=c10d \
+    --rdzv_backend=static \
-    --rdzv_endpoint=$MASTER_ADDR test.py
+    --rdzv_endpoint=$MASTER_ADDR \
+    --node-rank $NODE_RANK test.py
 ```
-If the script runs successfully, you should see the message `sanity check is successful!`.
+Set `MASTER_ADDR` to the IP address and port of the master node (e.g., `10.0.0.1:29400`), reachable from all nodes. Set `NODE_RANK` to `0` on the master node and `1`, `2`, ... on the workers. Adjust `--nproc-per-node` and `--nnodes` according to your setup.
-If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
 !!! note
-    A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
+    We use `--rdzv_backend=static` instead of `c10d` because the `c10d` rendezvous backend can fail with DNS resolution errors in multi-node setups (see [pytorch/pytorch#85300](https://github.com/pytorch/pytorch/issues/85300)). The `static` backend avoids this by requiring explicit node ranks.
-    - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
+If the script runs successfully, you should see the message `sanity check is successful!`.
-    - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
-    Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
+If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
 ## Python multiprocessing
@@ -318,7 +316,32 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to
 ## CUDA error: the provided PTX was compiled with an unsupported toolchain
-If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain.`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. The released vLLM wheels have to be compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [cuda compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. The solution is to install `cuda-compat` package from your package manager. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then add `export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH` to your `.bashrc` file. When successfully installed, you should see that the output of `nvidia-smi` will show `CUDA Version: 12.9`. Note that we use CUDA 12.9 as an example here, you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher.
+If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. This section also applies if you get the error `RuntimeError: The NVIDIA driver on your system is too old`.
+The released vLLM wheels are compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [CUDA compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. **This is only supported on select professional and datacenter NVIDIA GPUs.**
+If you are using the vLLM official Docker image, you can solve this by adding `-e VLLM_ENABLE_CUDA_COMPATIBILITY=1` to your `docker run` command. This will enable the pre-installed CUDA forward compatibility libraries.
+If you are running vLLM outside of Docker, the solution is to install the `cuda-compat` package from your package manager with the [CUDA repository](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) enabled. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then set `export VLLM_ENABLE_CUDA_COMPATIBILITY=1` and `export VLLM_CUDA_COMPATIBILITY_PATH="/usr/local/cuda-12.9/compat"`.
+On Conda, you can install the `conda-forge::cuda-compat` package (e.g., `conda install -c conda-forge cuda-compat=12.9`), then after activating the environment, set `export VLLM_ENABLE_CUDA_COMPATIBILITY=1` and `export VLLM_CUDA_COMPATIBILITY_PATH="${CONDA_PREFIX}/cuda-compat"`.
+You can verify the configuration works by running a minimal Python script that initializes CUDA via vLLM:
+```bash
+export VLLM_ENABLE_CUDA_COMPATIBILITY=1
+export VLLM_CUDA_COMPATIBILITY_PATH="/usr/local/cuda-12.9/compat"
+python3 - << 'EOF'
+import vllm
+import torch
+print(f"CUDA available: {torch.cuda.is_available()}")
+print(f"CUDA device count: {torch.accelerator.device_count()}")
+EOF
+```
+Note that we use CUDA 12.9 as an example here, and you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher.
 ## ptxas fatal: Value 'sm_110a' is not defined for option 'gpu-name'

--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -83,13 +83,13 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
 ### Hardware
-| Hardware         | Status                                        |
+| Hardware      | Status          |
-|------------------|-----------------------------------------------|
+| --------------| --------------- |
-| **NVIDIA**       | <nobr>🟢</nobr>                               |
+| **NVIDIA**    | <nobr>🟢</nobr> |
-| **AMD**          | <nobr>🟢</nobr>                               |
+| **AMD**       | <nobr>🟢</nobr> |
-| **INTEL GPU**    | <nobr>🟢</nobr>                               |
+| **INTEL GPU** | <nobr>🟢</nobr> |
-| **TPU**          | <nobr>🟢</nobr>                               |
+| **TPU**       | <nobr>🟢</nobr> |
-| **CPU**          | <nobr>🟢</nobr>                               |
+| **CPU**       | <nobr>🟢</nobr> |
 !!! note
@@ -104,13 +104,13 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
 ### Models
-| Model Type                  | Status                                                                  |
+| Model Type                 | Status                                  |
-|-----------------------------|-------------------------------------------------------------------------|
+| -------------------------- | --------------------------------------- |
-| **Decoder-only Models**     | <nobr>🟢</nobr>                                                         |
+| **Decoder-only Models**    | <nobr>🟢</nobr>                         |
-| **Encoder-Decoder Models**  | <nobr>🟢 (Whisper), 🔴 (Others) </nobr>                                |
+| **Encoder-Decoder Models** | <nobr>🟢 (Whisper), 🔴 (Others) </nobr> |
-| **Pooling Models**          | <nobr>🟢</nobr>                                                         |
+| **Pooling Models**         | <nobr>🟢</nobr>                         |
-| **Mamba Models**            | <nobr>🟢</nobr>                                                         |
+| **Mamba Models**           | <nobr>🟢</nobr>                         |
-| **Multimodal Models**       | <nobr>🟢</nobr>                                                         |
+| **Multimodal Models**      | <nobr>🟢</nobr>                         |
 See below for the status of models that are not yet supported or have more features planned in V1.
@@ -137,6 +137,7 @@ Please note that prefix caching is not yet supported for any of the above models
 Whisper is supported natively. Other encoder-decoder models are supported via the plugin system:
 - **BART**: `BartForConditionalGeneration` is supported via the official [bart-plugin](https://github.com/vllm-project/bart-plugin).
+- **Florence-2**: `Florence2ForConditionalGeneration` is supported via the official [bart-plugin](https://github.com/vllm-project/bart-plugin).
 For other encoder-decoder models (e.g., `MllamaForConditionalGeneration`), we recommend
 following a similar pattern by implementing support through the [plugin system](../design/plugin_system.md).
@@ -144,7 +145,7 @@ following a similar pattern by implementing support through the [plugin system](
 ### Features
 | Feature                                     | Status                                                                            |
-|---------------------------------------------|-----------------------------------------------------------------------------------|
+| ------------------------------------------- | --------------------------------------------------------------------------------- |
 | **Prefix Caching**                          | <nobr>🟢 Functional</nobr>                                                        |
 | **Chunked Prefill**                         | <nobr>🟢 Functional</nobr>                                                        |
 | **LoRA**                                    | <nobr>🟢 Functional</nobr>                                                        |

--- a/examples/offline_inference/basic/README.md
+++ b/examples/offline_inference/basic/README.md
-# Basic
+# Offline Inference
 The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
@@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc
 The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
 ```bash
-python examples/offline_inference/basic/basic.py
+python examples/basic/offline_inference/basic.py
 ```
 The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
 ```bash
-python examples/offline_inference/basic/classify.py
+python examples/basic/offline_inference/classify.py
 ```
 ```bash
-python examples/offline_inference/basic/embed.py
+python examples/basic/offline_inference/embed.py
 ```
 ```bash
-python examples/offline_inference/basic/score.py
+python examples/basic/offline_inference/score.py
 ```
 The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
 ```bash
-python examples/offline_inference/basic/chat.py
+python examples/basic/offline_inference/chat.py
 ```
 ```bash
-python examples/offline_inference/basic/generate.py
+python examples/basic/offline_inference/generate.py
 ```
 ## Features