From b48d5cca16a5583d58e998ecd52ad949c450a3b9 Mon Sep 17 00:00:00 2001
From: Carol Zheng <cazheng@google.com>
Date: Tue, 27 May 2025 14:54:59 -0700
Subject: [PATCH 001/274] [CI/Build] [TPU] Fix TPU CI exit code (#18282)

Signed-off-by: Carol Zheng <cazheng@google.com>
---
 .../scripts/hardware_ci/run-tpu-v1-test.sh    | 217 +++++++++++-------
 1 file changed, 133 insertions(+), 84 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 2d375d7e9..eb82da3a8 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -13,91 +13,140 @@ remove_docker_container
 
 # For HF_TOKEN.
 source /etc/environment
-# Run a simple end-to-end example.
+
 docker run --privileged --net host --shm-size=16G -it \
     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest pytest-asyncio tpu-info \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && export VLLM_XLA_CACHE_PATH= \
-    && export VLLM_USE_V1=1 \
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo HARDWARE \
-    && tpu-info \
-    && { \
-        echo TEST_0: Running test_perf.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
-        echo TEST_0_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_1: Running test_compilation.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
-        echo TEST_1_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_2: Running test_basic.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
-        echo TEST_2_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        echo TEST_3_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_4: Running test_quantization_accuracy.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
-        echo TEST_4_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_5: Running examples/offline_inference/tpu.py; \
-        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
-        echo TEST_5_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_6: Running test_tpu_model_runner.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
-        echo TEST_6_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_7: Running test_sampler.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
-        echo TEST_7_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_8: Running test_topk_topp_sampler.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
-        echo TEST_8_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_9: Running test_multimodal.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
-        echo TEST_9_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_10: Running test_pallas.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
-        echo TEST_10_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_11: Running test_struct_output_generate.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
-        echo TEST_11_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_12: Running test_moe_pallas.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
-        echo TEST_12_EXIT_CODE: \$?; \
-    } & \
-    # Disable the TPU LoRA tests until the feature is activated
-    # & { \
-    #     echo TEST_13: Running test_moe_pallas.py; \
-    #     python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
-    #     echo TEST_13_EXIT_CODE: \$?; \
-    # } & \
-    wait \
-    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
-"
+    vllm-tpu /bin/bash -c '
+set -e # Exit immediately if a command exits with a non-zero status.
+set -u # Treat unset variables as an error.
+
+echo "--- Starting script inside Docker container ---"
+
+# Create results directory
+RESULTS_DIR=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $RESULTS_DIR"
+
+# Install dependencies
+echo "--- Installing Python dependencies ---"
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
+echo "--- Python dependencies installed ---"
+export VLLM_USE_V1=1
+export VLLM_XLA_CHECK_RECOMPILATION=1
+export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"
+
+echo "--- Hardware Information ---"
+tpu-info
+echo "--- Starting Tests ---"
+set +e
+overall_script_exit_code=0
+
+# --- Test Definitions ---
+# If a test fails, this function will print logs and will not cause the main script to exit.
+run_test() {
+    local test_num=$1
+    local test_name=$2
+    local test_command=$3
+    local log_file="$RESULTS_DIR/test_${test_num}.log"
+    local actual_exit_code
+
+    echo "--- TEST_$test_num: Running $test_name ---"
+    
+    # Execute the test command.
+    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
+    actual_exit_code=$?
+
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
+
+    if [ "$actual_exit_code" -ne 0 ]; then
+        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
+        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
+        if [ -f "$log_file" ]; then
+            cat "$log_file" >&2
+        else
+            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
+        fi
+        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
+        return "$actual_exit_code" # Return the failure code
+    else
+        echo "TEST_$test_num ($test_name) PASSED."
+        return 0 # Return success
+    fi
+}
+
+# Helper function to call run_test and update the overall script exit code
+run_and_track_test() {
+    local test_num_arg="$1"
+    local test_name_arg="$2"
+    local test_command_arg="$3"
+
+    # Run the test
+    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
+    local test_specific_exit_code=$?
+
+    # If the test failed, set the overall script exit code to 1
+    if [ "$test_specific_exit_code" -ne 0 ]; then
+        # No need for extra echo here, run_test already logged the failure.
+        overall_script_exit_code=1
+    fi
+}
+
+# --- Actual Test Execution ---
+run_and_track_test 0 "test_perf.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
+run_and_track_test 1 "test_compilation.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
+run_and_track_test 2 "test_basic.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
+run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
+run_and_track_test 4 "test_quantization_accuracy.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
+run_and_track_test 5 "examples/offline_inference/tpu.py" \
+    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
+run_and_track_test 6 "test_tpu_model_runner.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
+run_and_track_test 7 "test_sampler.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
+run_and_track_test 8 "test_topk_topp_sampler.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
+run_and_track_test 9 "test_multimodal.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
+run_and_track_test 10 "test_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
+run_and_track_test 11 "test_struct_output_generate.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
+run_and_track_test 12 "test_moe_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+
+# Disable the TPU LoRA tests until the feature is activated
+# run_and_track_test 13 "test_lora (directory)" \
+#     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/"
+
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
 
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+    exit "$DOCKER_RUN_EXIT_CODE"
+else
+    echo "Docker run command completed successfully."
+    exit 0
+fi
 # TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
-- 
GitLab


From e0cbad4e30d7df4e9ee4634939ce19042639b735 Mon Sep 17 00:00:00 2001
From: Satyajith Chilappagari <satchill@amazon.com>
Date: Tue, 27 May 2025 15:10:33 -0700
Subject: [PATCH 002/274] [Neuron] Support quantization on neuron (#18283)

Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
---
 tests/neuron/1_core/test_neuron_quant.py              | 11 +++++++++++
 .../layers/quantization/neuron_quant.py               |  9 ++++++++-
 vllm/platforms/neuron.py                              |  2 +-
 3 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 tests/neuron/1_core/test_neuron_quant.py

diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py
new file mode 100644
index 000000000..68f0cb805
--- /dev/null
+++ b/tests/neuron/1_core/test_neuron_quant.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+from vllm.model_executor.layers.quantization.neuron_quant import (
+    NeuronQuantConfig)
+
+
+def test_get_supported_act_dtypes():
+    neuron_quant_config = NeuronQuantConfig()
+    supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
+    target_list = ["any_dtype1", "any_dtype2"]
+    for dtype in target_list:
+        assert dtype in supported_act_dtypes
diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py
index 38b374fee..b2d6bf5db 100644
--- a/vllm/model_executor/layers/quantization/neuron_quant.py
+++ b/vllm/model_executor/layers/quantization/neuron_quant.py
@@ -13,6 +13,12 @@ from vllm.model_executor.layers.quantization.base_config import (
 SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn']
 
 
+class AlwaysSupportedDtypes(list):
+
+    def __contains__(self, item):
+        return True
+
+
 class NeuronQuantConfig(QuantizationConfig):
     """Int8 Quantization Config class for Neuron Backend."""
 
@@ -35,7 +41,8 @@ class NeuronQuantConfig(QuantizationConfig):
         return "neuron_quant"
 
     def get_supported_act_dtypes(self) -> list[str]:
-        return SUPPORTED_QUANT_DTYPE_LIST
+        # Neuron implements custom handling logic for quantization support
+        return AlwaysSupportedDtypes()
 
     @classmethod
     def get_min_capability(cls) -> int:
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 9cd49fd34..474c70d04 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -28,7 +28,7 @@ class NeuronPlatform(Platform):
     device_name: str = "neuron"
     device_type: str = "neuron"
     ray_device_key: str = "neuron_cores"
-    supported_quantization: list[str] = ["neuron_quant"]
+    supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"]
     device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
 
     @classmethod
-- 
GitLab


From e56f44d9ec6f8f07f0d2c7936eea9bb2c0212bf2 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 27 May 2025 19:59:48 -0400
Subject: [PATCH 003/274] Support datasets in `vllm bench serve` and sync with
 benchmark_[serving,datasets].py (#18566)

---
 vllm/benchmarks/datasets.py              | 185 ++++++++++-
 vllm/benchmarks/endpoint_request_func.py | 226 +++++++++++++-
 vllm/benchmarks/serve.py                 | 382 +++++++++++++++++------
 3 files changed, 692 insertions(+), 101 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 74a9b2b03..712e83528 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -62,6 +62,7 @@ class SampleRequest:
 
 class BenchmarkDataset(ABC):
     DEFAULT_SEED = 0
+    IS_MULTIMODAL = False
 
     def __init__(
         self,
@@ -316,13 +317,15 @@ class RandomDataset(BenchmarkDataset):
         )
 
         vocab_size = tokenizer.vocab_size
+        num_special_tokens = tokenizer.num_special_tokens_to_add()
+        real_input_len = input_len - num_special_tokens
 
         prefix_token_ids = (np.random.randint(
             0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
 
         # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(input_len * (1 - range_ratio))
-        input_high = int(input_len * (1 + range_ratio))
+        input_low = int(real_input_len * (1 - range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
         output_low = int(output_len * (1 - range_ratio))
         output_high = int(output_len * (1 + range_ratio))
 
@@ -345,6 +348,17 @@ class RandomDataset(BenchmarkDataset):
                          vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            # For example for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decode again.
+            re_encoded_sequence = tokenizer.encode(
+                prompt, add_special_tokens=False)[:input_lens[i]]
+            prompt = tokenizer.decode(re_encoded_sequence)
             total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(
@@ -637,6 +651,7 @@ class ConversationDataset(HuggingFaceDataset):
     SUPPORTED_DATASET_PATHS = {
         'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
     }
+    IS_MULTIMODAL = True
 
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
@@ -701,6 +716,7 @@ class VisionArenaDataset(HuggingFaceDataset):
         "lmarena-ai/vision-arena-bench-v0.1":
         lambda x: x["turns"][0][0]["content"]
     }
+    IS_MULTIMODAL = True
 
     def sample(
         self,
@@ -784,6 +800,64 @@ class InstructCoderDataset(HuggingFaceDataset):
         return sampled_requests
 
 
+# -----------------------------------------------------------------------------
+# MT-Bench Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class MTBenchDataset(HuggingFaceDataset):
+    """
+    MT-Bench Dataset.
+    https://huggingface.co/datasets/philschmid/mt-bench
+
+    We create a single turn dataset for MT-Bench.
+    This is similar to Spec decoding benchmark setup in vLLM
+    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
+    """  # noqa: E501
+
+    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
+    SUPPORTED_DATASET_PATHS = {
+        "philschmid/mt-bench",
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        sampled_requests = []
+
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["turns"][0]
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{
+                    "role": "user",
+                    "content": prompt
+                }],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
+
+
 # -----------------------------------------------------------------------------
 # AIMO Dataset Implementation
 # -----------------------------------------------------------------------------
@@ -858,18 +932,18 @@ def _format_zeta_prompt(
         sample: dict,
         original_start_marker: str = "<|editable_region_start|>") -> dict:
     """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
-    
-    This function formats examples from the NEP dataset 
-    into prompts and expected outputs. It could be 
+
+    This function formats examples from the NEP dataset
+    into prompts and expected outputs. It could be
     further extended to support more NEP datasets.
-    
+
     Args:
-        sample: The dataset sample containing events, 
+        sample: The dataset sample containing events,
             inputs, and outputs.
-        original_start_marker: The marker indicating the 
-            start of the editable region. Defaults to 
+        original_start_marker: The marker indicating the
+            start of the editable region. Defaults to
             "<|editable_region_start|>".
-            
+
     Returns:
         A dictionary with the formatted prompts and expected outputs.
     """
@@ -919,3 +993,94 @@ class NextEditPredictionDataset(HuggingFaceDataset):
                 break
         self.maybe_oversample_requests(samples, num_requests)
         return samples
+
+
+# -----------------------------------------------------------------------------
+# ASR Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class ASRDataset(HuggingFaceDataset):
+    """
+    Dataset class for processing a ASR dataset for transcription.
+    Tested on the following set:
+
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
+    |                |                                        |                          | release3-speaker-adaptation |
+    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr,  ...        |
+    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
+    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
+    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
+    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+
+    """  # noqa: E501
+
+    SUPPORTED_DATASET_PATHS = {
+        "openslr/librispeech_asr",
+        "facebook/voxpopuli",
+        "LIUM/tedlium",
+        "edinburghcstr/ami",
+        "speechcolab/gigaspeech",
+        "kensho/spgispeech",
+    }
+
+    DEFAULT_OUTPUT_LEN = 128
+    IS_MULTIMODAL = True
+
+    # TODO Whisper-specific. Abstract interface when more models are supported.
+    TRANSCRIPTION_PREAMBLE = (
+        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>")
+    skip_long_audios: bool = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        **kwargs,
+    ) -> list:
+        try:
+            import librosa
+        except ImportError as e:
+            raise ImportError(
+                "librosa is required for ASRDataset. Please install it "
+                "using `pip install librosa`.") from e
+
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
+        prompt_len = len(tokenizer(prompt).input_ids)
+        sampled_requests = []
+        skipped = 0
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            audio = item["audio"]
+            y, sr = audio["array"], audio["sampling_rate"]
+            duration_s = librosa.get_duration(y=y, sr=sr)
+            # Whisper max supported duration
+            if self.skip_long_audios and duration_s > 30:
+                skipped += 1
+                continue
+
+            mm_content = {"audio": (y, sr)}
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                ))
+        if skipped:
+            logger.warning(
+                "%d samples discarded from dataset due to"
+                " their length being greater than"
+                " what Whisper supports.",
+                skipped,
+            )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/endpoint_request_func.py
index 32767a896..a28630d50 100644
--- a/vllm/benchmarks/endpoint_request_func.py
+++ b/vllm/benchmarks/endpoint_request_func.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """The request function for API endpoints."""
 
+import io
 import json
 import os
 import sys
@@ -24,11 +25,11 @@ class RequestFuncInput:
     output_len: int
     model: str
     model_name: Optional[str] = None
-    best_of: int = 1
     logprobs: Optional[int] = None
     extra_body: Optional[dict] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
+    language: Optional[str] = None
 
 
 @dataclass
@@ -71,7 +72,7 @@ async def async_request_openai_completions(
                 if request_func_input.model_name else request_func_input.model,
             "prompt": request_func_input.prompt,
             "temperature": 0.0,
-            "best_of": request_func_input.best_of,
+            "repetition_penalty": 1.0,
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
             "stream": True,
@@ -154,7 +155,226 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("chat/completions", "profile")), (
+        "OpenAI Chat Completions API URL must end with 'chat/completions'.")
+
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
+        payload = {
+            "model":
+            request_func_input.model_name
+            if request_func_input.model_name else request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": content
+                },
+            ],
+            "temperature":
+            0.0,
+            "max_completion_tokens":
+            request_func_input.output_len,
+            "stream":
+            True,
+            "stream_options": {
+                "include_usage": True,
+            },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
+                        if chunk != "[DONE]":
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get("content")
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+
+                                generated_text += content or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
+
+                            most_recent_timestamp = timestamp
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_audio(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    # Lazy import without PlaceholderModule to avoid vllm dep.
+    import soundfile
+
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("transcriptions", "translations")), (
+        "OpenAI Chat Completions API URL must end with 'transcriptions' ")
+    "or `translations`."
+
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        payload = {
+            "model":
+            request_func_input.model_name
+            if request_func_input.model_name else request_func_input.model,
+            "temperature":
+            0.0,
+            "max_completion_tokens":
+            request_func_input.output_len,
+            "stream":
+            True,
+            "language":
+            "en",
+            # Flattened due to multipart/form-data
+            "stream_include_usage":
+            True,
+            "stream_continuous_usage_stats":
+            True,
+        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        # Send audio file
+        def to_bytes(y, sr):
+            buffer = io.BytesIO()
+            soundfile.write(buffer, y, sr, format="WAV")
+            buffer.seek(0)
+            return buffer
+
+        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
+            form = aiohttp.FormData()
+            form.add_field("file", f, content_type="audio/wav")
+            for key, value in payload.items():
+                form.add_field(key, str(value))
+
+            output = RequestFuncOutput()
+            output.prompt_len = request_func_input.prompt_len
+
+            generated_text = ""
+            ttft = 0.0
+            st = time.perf_counter()
+            most_recent_timestamp = st
+            try:
+                async with session.post(url=api_url,
+                                        data=form,
+                                        headers=headers) as response:
+                    if response.status == 200:
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = chunk_bytes.decode("utf-8").removeprefix(
+                                "data: ")
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get(
+                                        "content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp)
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get(
+                                        "completion_tokens")
+
+                                most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = most_recent_timestamp - st
+                    else:
+                        output.error = response.reason or ""
+                        output.success = False
+            except Exception:
+                output.success = False
+                exc_info = sys.exc_info()
+                output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS = {
-    "openai-comp": async_request_openai_completions,
+    "vllm": async_request_openai_completions,
+    "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
+    "openai-audio": async_request_openai_audio,
 }
+
+OPENAI_COMPATIBLE_BACKENDS = [
+    k for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v in (async_request_openai_completions,
+             async_request_openai_chat_completions)
+]
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index dc0ec3219..040815e87 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -7,7 +7,7 @@ to launch the vLLM OpenAI API server:
 
 On the client side, run:
     vllm bench serve \
-        --endpoint-type <endpoint_type. Default 'openi-comp'> \
+        --endpoint-type <endpoint_type. Default 'openai'> \
         --label <benchmark result label. Default using endpoint_type> \
         --model <your_model> \
         --dataset-name <dataset_name. Default 'random'> \
@@ -22,7 +22,7 @@ import os
 import random
 import time
 import warnings
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Optional
@@ -31,7 +31,14 @@ import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
+from vllm.benchmarks.datasets import (AIMODataset, ASRDataset, BurstGPTDataset,
+                                      ConversationDataset, HuggingFaceDataset,
+                                      InstructCoderDataset, MTBenchDataset,
+                                      NextEditPredictionDataset, RandomDataset,
+                                      SampleRequest, ShareGPTDataset,
+                                      SonnetDataset, VisionArenaDataset)
 from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
+                                                   OPENAI_COMPATIBLE_BACKENDS,
                                                    RequestFuncInput,
                                                    RequestFuncOutput)
 from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
@@ -71,53 +78,18 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]
 
 
-def sample_random_requests(
-    prefix_len: int,
-    input_len: int,
-    output_len: int,
-    num_prompts: int,
-    range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-) -> list[tuple[str, int, int]]:
-    prefix_token_ids = np.random.randint(0,
-                                         tokenizer.vocab_size,
-                                         size=prefix_len).tolist()
-
-    input_lens = np.random.randint(
-        int(input_len * range_ratio),
-        input_len + 1,
-        size=num_prompts,
-    )
-    output_lens = np.random.randint(
-        int(output_len * range_ratio),
-        output_len + 1,
-        size=num_prompts,
-    )
-    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
-    input_requests = []
-    for i in range(num_prompts):
-        prompt = tokenizer.decode(prefix_token_ids +
-                                  [(offsets[i] + i + j) % tokenizer.vocab_size
-                                   for j in range(input_lens[i])])
-
-        input_requests.append((prompt, int(prefix_len + input_lens[i]),
-                               int(output_lens[i]), None))
-
-    return input_requests
-
-
 async def get_request(
-    input_requests: list[tuple[str, int, int]],
+    input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float = 1.0,
-) -> AsyncGenerator[tuple[str, int, int], None]:
+) -> AsyncGenerator[SampleRequest, None]:
     """
     Asynchronously generates requests at a specified rate
     with OPTIONAL burstiness.
 
     Args:
         input_requests:
-            A list of input requests, each represented as a tuple.
+            A list of input requests, each represented as a SampleRequest.
         request_rate:
             The rate at which requests are generated (requests/s).
         burstiness (optional):
@@ -129,7 +101,7 @@ async def get_request(
             in more bursty requests, while a higher burstiness value
             (burstiness > 1) results in a more uniform arrival of requests.
     """
-    input_requests = iter(input_requests)
+    input_requests: Iterable[SampleRequest] = iter(input_requests)
 
     # Calculate scale parameter theta to maintain the desired request_rate.
     assert burstiness > 0, (
@@ -151,7 +123,7 @@ async def get_request(
 
 
 def calculate_metrics(
-    input_requests: list[tuple[str, int, int]],
+    input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
@@ -184,7 +156,7 @@ def calculate_metrics(
         if outputs[i].success:
             output_len = outputs[i].output_tokens
 
-            if output_len is None:
+            if not output_len:
                 # We use the tokenizer to count the number of output tokens
                 # for some serving backends instead of looking at
                 # len(outputs[i].itl) since multiple output tokens may be
@@ -194,7 +166,7 @@ def calculate_metrics(
                     tokenizer(outputs[i].generated_text,
                               add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
-            total_input += input_requests[i][1]
+            total_input += input_requests[i].prompt_len
             tpot = 0
             if output_len > 1:
                 latency_minus_ttft = outputs[i].latency - outputs[i].ttft
@@ -277,19 +249,19 @@ async def benchmark(
     model_id: str,
     model_name: str,
     tokenizer: PreTrainedTokenizerBase,
-    input_requests: list[tuple[str, int, int]],
+    input_requests: list[SampleRequest],
     logprobs: Optional[int],
-    best_of: int,
     request_rate: float,
     burstiness: float,
     disable_tqdm: bool,
     profile: bool,
     selected_percentile_metrics: list[str],
-    selected_percentiles: list[str],
+    selected_percentiles: list[float],
     ignore_eos: bool,
     goodput_config_dict: dict[str, float],
     max_concurrency: Optional[int],
-    lora_modules: Optional[list[str]],
+    lora_modules: Optional[Iterable[str]],
+    extra_body: Optional[dict],
 ):
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -298,11 +270,13 @@ async def benchmark(
 
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len, test_mm_content = (
-        input_requests[0])
-    if endpoint_type != "openai-chat" and test_mm_content is not None:
-        # multi-modal benchmark is only available on OpenAI Chat endpoint.
-        raise ValueError("Multi-modal content is only supported on "
-                         "'openai-chat' endpoint_type.")
+        input_requests[0].prompt,
+        input_requests[0].prompt_len,
+        input_requests[0].expected_output_len,
+        input_requests[0].multi_modal_data,
+    )
+
+    assert test_mm_content is None or isinstance(test_mm_content, dict)
     test_input = RequestFuncInput(
         model=model_id,
         model_name=model_name,
@@ -311,9 +285,9 @@ async def benchmark(
         prompt_len=test_prompt_len,
         output_len=test_output_len,
         logprobs=logprobs,
-        best_of=best_of,
         multi_modal_content=test_mm_content,
         ignore_eos=ignore_eos,
+        extra_body=extra_body,
     )
 
     test_output = await request_func(request_func_input=test_input)
@@ -338,9 +312,9 @@ async def benchmark(
                                          prompt_len=test_prompt_len,
                                          output_len=test_output_len,
                                          logprobs=logprobs,
-                                         best_of=best_of,
                                          multi_modal_content=test_mm_content,
-                                         ignore_eos=ignore_eos)
+                                         ignore_eos=ignore_eos,
+                                         extra_body=extra_body)
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
             print("Profiler started")
@@ -374,7 +348,12 @@ async def benchmark(
     benchmark_start_time = time.perf_counter()
     tasks: list[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate, burstiness):
-        prompt, prompt_len, output_len, mm_content = request
+        prompt, prompt_len, output_len, mm_content = (
+            request.prompt,
+            request.prompt_len,
+            request.expected_output_len,
+            request.multi_modal_data,
+        )
         req_model_id, req_model_name = model_id, model_name
         if lora_modules:
             req_lora_module = next(lora_modules)
@@ -387,9 +366,9 @@ async def benchmark(
                                               prompt_len=prompt_len,
                                               output_len=output_len,
                                               logprobs=logprobs,
-                                              best_of=best_of,
                                               multi_modal_content=mm_content,
-                                              ignore_eos=ignore_eos)
+                                              ignore_eos=ignore_eos,
+                                              extra_body=extra_body)
         tasks.append(
             asyncio.create_task(
                 limited_request_func(request_func_input=request_func_input,
@@ -405,7 +384,6 @@ async def benchmark(
             prompt_len=test_prompt_len,
             output_len=test_output_len,
             logprobs=logprobs,
-            best_of=best_of,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -567,7 +545,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--endpoint-type",
         type=str,
-        default="openai-comp",
+        default="openai",
         choices=list(ASYNC_REQUEST_FUNCS.keys()),
     )
     parser.add_argument(
@@ -596,9 +574,16 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--dataset-name",
         type=str,
         default="random",
-        choices=["random"],
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
         help="Name of the dataset to benchmark on.",
     )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default=None,
+        help="Path to the sharegpt/sonnet dataset. "
+        "Or the huggingface dataset ID if using HF dataset.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -624,13 +609,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help=
         "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
-    parser.add_argument(
-        "--best-of",
-        type=int,
-        default=1,
-        help="Generates `best_of` sequences per prompt and "
-        "returns the best one.",
-    )
     parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument(
         "--num-prompts",
@@ -691,6 +669,17 @@ def add_cli_args(parser: argparse.ArgumentParser):
         action="store_true",
         help="Specify to save benchmark results to a json file",
     )
+    parser.add_argument(
+        "--save-detailed",
+        action="store_true",
+        help="When saving the results, whether to include per request "
+        "information such as response, error, ttfs, tpots, etc.",
+    )
+    parser.add_argument(
+        "--append-result",
+        action="store_true",
+        help="Append the benchmark result to the existing json file.",
+    )
     parser.add_argument(
         "--metadata",
         metavar="KEY=VALUE",
@@ -733,6 +722,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default="99",
         help="Comma-separated list of percentiles for selected metrics. "
         "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+        "Default value is \"99\"."
         "Use \"--percentile-metrics\" to select metrics.",
     )
     parser.add_argument(
@@ -745,7 +735,41 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "separated by spaces. Allowed request level metric names are "
         "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
         "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
-        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+    )
+
+    # group for dataset specific arguments
+    sonnet_group = parser.add_argument_group("sonnet dataset options")
+    sonnet_group.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+
+    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
+    sharegpt_group.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.",
+    )
 
     random_group = parser.add_argument_group("random dataset options")
     random_group.add_argument(
@@ -765,9 +789,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
     random_group.add_argument(
         "--random-range-ratio",
         type=float,
-        default=1.0,
-        help="Range of sampled ratio of input/output length, "
-        "used only for random sampling.",
+        default=0.0,
+        help="Range ratio for sampling input/output length, "
+        "used only for random sampling. Must be in the range [0, 1) to define "
+        "a symmetric sampling range"
+        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
     )
     random_group.add_argument(
         "--random-prefix-len",
@@ -778,6 +804,54 @@ def add_cli_args(parser: argparse.ArgumentParser):
         " request is [random-prefix-len, "
         " random-prefix-len + random-prefix-len * random-range-ratio).")
 
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument("--hf-subset",
+                          type=str,
+                          default=None,
+                          help="Subset of the HF dataset.")
+    hf_group.add_argument("--hf-split",
+                          type=str,
+                          default=None,
+                          help="Split of the HF dataset.")
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths "
+        "from the sampled HF dataset.",
+    )
+
+    sampling_group = parser.add_argument_group("sampling parameters")
+    sampling_group.add_argument(
+        "--top-p",
+        type=float,
+        default=None,
+        help="Top-p sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--top-k",
+        type=int,
+        default=None,
+        help="Top-k sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--min-p",
+        type=float,
+        default=None,
+        help="Min-p sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--temperature",
+        type=float,
+        default=None,
+        help="Temperature sampling parameter. Only has effect on "
+        "openai-compatible backends. If not specified, default to greedy "
+        "decoding (i.e. temperature==0.0).",
+    )
+
     parser.add_argument(
         '--tokenizer-mode',
         type=str,
@@ -826,27 +900,142 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id,
                               tokenizer_mode=tokenizer_mode,
                               trust_remote_code=args.trust_remote_code)
-    # TODO: This should be refactored to use the benchmark_dataset.py
-    # in later PRs.
+
     if args.dataset_name is None:
         raise ValueError(
             "Please specify '--dataset-name' and the corresponding "
             "'--dataset-path' if required.")
-    elif args.dataset_name == "random":
-        input_requests = sample_random_requests(
-            prefix_len=args.random_prefix_len,
-            input_len=args.random_input_len,
-            output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
-            range_ratio=args.random_range_ratio,
+
+    if args.dataset_name == "sonnet":
+        dataset = SonnetDataset(dataset_path=args.dataset_path)
+        # For the "sonnet" dataset, formatting depends on the backend.
+        if args.backend == "openai-chat":
+            input_requests = dataset.sample(
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+                return_prompt_formatted=False,
+            )
+        else:
+            assert tokenizer.chat_template or tokenizer.default_chat_template, (
+                "Tokenizer/model must have chat template for sonnet dataset.")
+            input_requests = dataset.sample(
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+                return_prompt_formatted=True,
+            )
+
+    elif args.dataset_name == "hf":
+        # all following datasets are implemented from the
+        # HuggingFaceDataset base class
+        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = VisionArenaDataset
+            args.hf_split = "train"
+            args.hf_subset = None
+        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = InstructCoderDataset
+            args.hf_split = "train"
+        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = MTBenchDataset
+            args.hf_split = "train"
+        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = ConversationDataset
+            args.hf_split = "train"
+        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = AIMODataset
+            args.hf_split = "train"
+        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
+            dataset_class = NextEditPredictionDataset
+            args.hf_split = "train"
+        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = ASRDataset
+            args.hf_split = "train"
+        else:
+            supported_datasets = set([
+                dataset_name for cls in HuggingFaceDataset.__subclasses__()
+                for dataset_name in cls.SUPPORTED_DATASET_PATHS
+            ])
+            raise ValueError(
+                f"Unsupported dataset path: {args.dataset_path}. "
+                "Huggingface dataset only supports dataset_path"
+                f" from one of following: {supported_datasets}. "
+                "Please consider contributing if you would "
+                "like to add support for additional dataset formats.")
+
+        if dataset_class.IS_MULTIMODAL and endpoint_type not in [
+                "openai-chat",
+                "openai-audio",
+        ]:
+            # multi-modal benchmark is only available on OpenAI Chat backend.
+            raise ValueError(
+                "Multi-modal content is only supported on 'openai-chat' and "
+                "'openai-audio' backend.")
+        input_requests = dataset_class(
+            dataset_path=args.dataset_path,
+            dataset_subset=args.hf_subset,
+            dataset_split=args.hf_split,
+            random_seed=args.seed,
+        ).sample(
+            num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            output_len=args.hf_output_len,
         )
 
     else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
-
+        # For datasets that follow a similar structure, use a mapping.
+        dataset_mapping = {
+            "sharegpt":
+            lambda: ShareGPTDataset(random_seed=args.seed,
+                                    dataset_path=args.dataset_path).sample(
+                                        tokenizer=tokenizer,
+                                        num_requests=args.num_prompts,
+                                        output_len=args.sharegpt_output_len,
+                                    ),
+            "burstgpt":
+            lambda: BurstGPTDataset(random_seed=args.seed,
+                                    dataset_path=args.dataset_path).
+            sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+            "random":
+            lambda: RandomDataset(dataset_path=args.dataset_path).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                prefix_len=args.random_prefix_len,
+                input_len=args.random_input_len,
+                output_len=args.random_output_len,
+                range_ratio=args.random_range_ratio,
+            ),
+        }
+
+        try:
+            input_requests = dataset_mapping[args.dataset_name]()
+        except KeyError as err:
+            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
     goodput_config_dict = check_goodput_args(args)
 
+    # Collect the sampling parameters.
+    sampling_params = {
+        k: v
+        for k, v in {
+            "top_p": args.top_p,
+            "top_k": args.top_k,
+            "min_p": args.min_p,
+            "temperature": args.temperature,
+        }.items() if v is not None
+    }
+
+    # Sampling parameters are only supported by openai-compatible backend.
+    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
+        raise ValueError("Sampling parameters are only supported by "
+                         "openai-compatible backends.")
+
+    if "temperature" not in sampling_params:
+        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
@@ -861,7 +1050,6 @@ def main(args: argparse.Namespace):
             tokenizer=tokenizer,
             input_requests=input_requests,
             logprobs=args.logprobs,
-            best_of=args.best_of,
             request_rate=args.request_rate,
             burstiness=args.burstiness,
             disable_tqdm=args.disable_tqdm,
@@ -874,10 +1062,11 @@ def main(args: argparse.Namespace):
             goodput_config_dict=goodput_config_dict,
             max_concurrency=args.max_concurrency,
             lora_modules=args.lora_modules,
+            extra_body=sampling_params,
         ))
 
     # Save config and results to json
-    if args.save_result:
+    if args.save_result or args.append_result:
         result_json: dict[str, Any] = {}
 
         # Setup
@@ -887,7 +1076,6 @@ def main(args: argparse.Namespace):
         result_json["label"] = label
         result_json["model_id"] = model_id
         result_json["tokenizer_id"] = tokenizer_id
-        result_json["best_of"] = args.best_of
         result_json["num_prompts"] = args.num_prompts
 
         # Metadata
@@ -910,6 +1098,19 @@ def main(args: argparse.Namespace):
         # Merge with benchmark result
         result_json = {**result_json, **benchmark_result}
 
+        if not args.save_detailed:
+            # Remove fields with too many data points
+            for field in [
+                    "input_lens",
+                    "output_lens",
+                    "ttfts",
+                    "itls",
+                    "generated_texts",
+                    "errors",
+            ]:
+                if field in result_json:
+                    del result_json[field]
+
         # Save to file
         base_model_id = model_id.split("/")[-1]
         max_concurrency_str = (f"-concurrency{args.max_concurrency}"
@@ -920,6 +1121,11 @@ def main(args: argparse.Namespace):
             file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name, "w", encoding='utf-8') as outfile:
+        with open(file_name,
+                  mode="a+" if args.append_result else "w",
+                  encoding="utf-8") as outfile:
+            # Append a newline.
+            if args.append_result and outfile.tell() != 0:
+                outfile.write("\n")
             json.dump(result_json, outfile)
         save_to_pytorch_benchmark_format(args, result_json, file_name)
-- 
GitLab


From 51e98e4ffd696289bce1ec92c78cc7298dc1600b Mon Sep 17 00:00:00 2001
From: cascade <cascade812@outlook.com>
Date: Tue, 27 May 2025 17:18:09 -0700
Subject: [PATCH 004/274] [Bugfix] Disable prefix caching by default for
 benchmark (#18771)

Signed-off-by: cascade812 <cascade812@outlook.com>
---
 vllm/benchmarks/latency.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 2c992727b..0dd938e75 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -82,7 +82,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser = EngineArgs.add_cli_args(parser)
     # V1 enables prefix caching by default which skews the latency
     # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=True)
+    parser.set_defaults(enable_prefix_caching=False)
 
 
 def main(args: argparse.Namespace):
-- 
GitLab


From a3896c7f0216d6930a912924e9149f31087232c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luka=20Govedi=C4=8D?=
 <ProExpertProg@users.noreply.github.com>
Date: Tue, 27 May 2025 20:49:24 -0400
Subject: [PATCH 005/274] [Build] Fixes for CMake install (#18570)

---
 CMakeLists.txt                                |  5 +++++
 cmake/external_projects/vllm_flash_attn.cmake | 20 +++++++++++++++++--
 cmake/utils.cmake                             |  2 +-
 setup.py                                      |  5 +----
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c5856fc5..6536e9a57 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@@ -785,5 +788,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
     include(cmake/external_projects/flashmla.cmake)
+
+    # vllm-flash-attn should be last as it overwrites some CMake functions
     include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index b04e4c2d0..a4edd5b96 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -46,22 +46,38 @@ else()
 endif()
 
 
+# Ensure the vllm/vllm_flash_attn directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
+
+# Make sure vllm-flash-attn install rules are nested under vllm/
+# This is here to support installing all components under the same prefix with cmake --install.
+# setup.py installs every component separately but uses the same prefix for all.
+# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
+# and these statements don't hurt when installing neither component.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
+install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
+
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 
+# Restore the install prefix
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
 # case only one is built, in the case both are built redundant work is done)
 install(
   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm_flash_attn
+  DESTINATION vllm/vllm_flash_attn
   COMPONENT _vllm_fa2_C
   FILES_MATCHING PATTERN "*.py"
 )
 
 install(
   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm_flash_attn
+  DESTINATION vllm/vllm_flash_attn
   COMPONENT _vllm_fa3_C
   FILES_MATCHING PATTERN "*.py"
 )
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 12e4e3902..6d90555f2 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
   set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
   add_custom_target(
     hipify${NAME}
-    COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
+    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
     DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
     BYPRODUCTS ${HIP_SRCS}
     COMMENT "Running hipify on ${NAME} extension source files.")
diff --git a/setup.py b/setup.py
index 180f2f978..b822a4ec3 100644
--- a/setup.py
+++ b/setup.py
@@ -251,11 +251,8 @@ class cmake_build_ext(build_ext):
 
             # CMake appends the extension prefix to the install path,
             # and outdir already contains that prefix, so we need to remove it.
-            # We assume only the final component of extension prefix is added by
-            # CMake, this is currently true for current extensions but may not
-            # always be the case.
             prefix = outdir
-            if '.' in ext.name:
+            for _ in range(ext.name.count('.')):
                 prefix = prefix.parent
 
             # prefix here should actually be the same for all components
-- 
GitLab


From d73a9457a56e99d33676f668611bfb232357f1b9 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Wed, 28 May 2025 02:46:21 +0100
Subject: [PATCH 006/274] [Core] Improve Tensor serialisation (#18774)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/v1/serial_utils.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index 0dcf02113..fbd38fc47 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -158,10 +158,8 @@ class MsgpackEncoder:
         self, obj: torch.Tensor
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
-        # this creates a copy of the tensor if it's not already contiguous
-        obj = obj.contiguous()
         #  view the tensor as a 1D array of bytes
-        arr = obj.view((obj.numel(), )).view(torch.uint8).numpy()
+        arr = obj.flatten().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
@@ -169,7 +167,7 @@ class MsgpackEncoder:
             # Otherwise encode index of backing buffer to avoid copy.
             data = len(self.aux_buffers)
             self.aux_buffers.append(arr.data)
-        dtype = str(obj.dtype)[6:]  # remove 'torch.' prefix
+        dtype = str(obj.dtype).removeprefix("torch.")
         return dtype, obj.shape, data
 
     def _encode_nested_tensors(self, nt: NestedTensors) -> Any:
@@ -245,7 +243,7 @@ class MsgpackDecoder:
         # zero-copy decode. We assume the ndarray will not be kept around,
         # as it now locks the whole received message buffer in memory.
         buffer = self.aux_buffers[data] if isinstance(data, int) else data
-        return np.ndarray(buffer=buffer, dtype=np.dtype(dtype), shape=shape)
+        return np.frombuffer(buffer, dtype=dtype).reshape(shape)
 
     def _decode_tensor(self, arr: Any) -> torch.Tensor:
         dtype, shape, data = arr
@@ -254,12 +252,15 @@ class MsgpackDecoder:
         # not complain about a readonly memoryview.
         buffer = self.aux_buffers[data] if isinstance(data, int) \
             else bytearray(data)
-        # Create numpy wrapper around the bytes
-        arr = np.ndarray(buffer=buffer, dtype=np.uint8, shape=(len(buffer), ))
         torch_dtype = getattr(torch, dtype)
         assert isinstance(torch_dtype, torch.dtype)
+        if not buffer:  # torch.frombuffer doesn't like empty buffers
+            assert 0 in shape
+            return torch.empty(shape, dtype=torch_dtype)
+        # Create uint8 array
+        arr = torch.frombuffer(buffer, dtype=torch.uint8)
         # Convert back to proper shape & type
-        return torch.from_numpy(arr).view(torch_dtype).view(shape)
+        return arr.view(torch_dtype).view(shape)
 
     def _decode_mm_items(self, obj: list) -> list[MultiModalKwargsItem]:
         decoded_items = []
-- 
GitLab


From 794ae1f551e4152a258f88c0c914ce5ea130bd05 Mon Sep 17 00:00:00 2001
From: fxmarty-amd <felmarty@amd.com>
Date: Wed, 28 May 2025 04:45:41 +0200
Subject: [PATCH 007/274] [rocm] Fix wrong attention log (#18764)

Signed-off-by: Felix Marty <felmarty@amd.com>
---
 vllm/platforms/rocm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index e1dcd9870..b5e742c65 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -194,8 +194,9 @@ class RocmPlatform(Platform):
                     f" The selected backend, {selected_backend.name},"
                     f"is not MLA type while requested for MLA backend.")
 
-        selected_backend = (_Backend.ROCM_FLASH if selected_backend
-                            == _Backend.FLASH_ATTN else selected_backend)
+        if selected_backend is None or selected_backend == _Backend.FLASH_ATTN:
+            selected_backend = _Backend.ROCM_FLASH
+
         if envs.VLLM_USE_V1:
             logger.info("Using Triton Attention backend on V1 engine.")
             return ("vllm.v1.attention.backends."
-- 
GitLab


From 3e9ce609bd0636301b3dd95deeea4b7b6df3f654 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Wed, 28 May 2025 11:29:53 +0800
Subject: [PATCH 008/274] [Bugfix] Fix nomic max_model_len (#18755)

---
 .../offline_inference/context_extension.py    |  46 +++++++
 .../pooling/test_nomic_max_model_len.py       | 130 ++++++++++++++++++
 vllm/config.py                                |  14 ++
 vllm/model_executor/models/bert_with_rope.py  |  55 +++++++-
 4 files changed, 242 insertions(+), 3 deletions(-)
 create mode 100644 examples/offline_inference/context_extension.py
 create mode 100644 tests/models/language/pooling/test_nomic_max_model_len.py

diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py
new file mode 100644
index 000000000..1a70446c3
--- /dev/null
+++ b/examples/offline_inference/context_extension.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, SamplingParams
+
+rope_theta = 1000000
+original_max_position_embeddings = 32768
+factor = 4.0
+
+# Use yarn to extend context
+hf_overrides = {
+    "rope_theta": rope_theta,
+    "rope_scaling": {
+        "rope_type": "yarn",
+        "factor": factor,
+        "original_max_position_embeddings": original_max_position_embeddings,
+    },
+    "max_model_len": int(original_max_position_embeddings * factor),
+}
+
+llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
+
+sampling_params = SamplingParams(
+    temperature=0.8,
+    top_p=0.95,
+    max_tokens=128,
+)
+
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant"},
+    {"role": "user", "content": "Hello"},
+    {"role": "assistant", "content": "Hello! How can I assist you today?"},
+]
+outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+
+
+def print_outputs(outputs):
+    print("\nGenerated Outputs:\n" + "-" * 80)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\n")
+        print(f"Generated text: {generated_text!r}")
+        print("-" * 80)
+
+
+print_outputs(outputs)
diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py
new file mode 100644
index 000000000..68603e628
--- /dev/null
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: SIM117
+import pytest
+
+from ...utils import EmbedModelInfo
+
+MODELS = [
+    EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
+    #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
+    #EmbedModelInfo("nomic-ai/CodeRankEmbed"),
+    EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
+    #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
+]
+
+rope_theta = 1000
+factor = 4.0
+original_max_position_embeddings = 2048
+max_model_len = int(original_max_position_embeddings * factor)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_default(model_info, vllm_runner):
+    with vllm_runner(model_info.name, task="embed",
+                     max_model_len=None) as vllm_model:
+        model_config = vllm_model.model.llm_engine.model_config
+        if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
+            # For nomic-embed-text-v2-moe the length is set to 512
+            # by sentence_bert_config.json.
+            assert model_config.max_model_len == 512
+        else:
+            assert (
+                model_config.max_model_len == original_max_position_embeddings)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_set_max_model_len_legal(model_info, vllm_runner):
+    # set max_model_len <= 512
+    with vllm_runner(model_info.name, task="embed",
+                     max_model_len=256) as vllm_model:
+        model_config = vllm_model.model.llm_engine.model_config
+        assert model_config.max_model_len == 256
+
+    # set 512 < max_model_len <= 2048
+    if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
+        # For nomic-embed-text-v2-moe the length is set to 512
+        # by sentence_bert_config.json.
+        with pytest.raises(ValueError):
+            with vllm_runner(model_info.name, task="embed",
+                             max_model_len=1024):
+                pass
+    else:
+        with vllm_runner(model_info.name, task="embed",
+                         max_model_len=1024) as vllm_model:
+            model_config = vllm_model.model.llm_engine.model_config
+            assert model_config.max_model_len == 1024
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_set_max_model_len_illegal(model_info, vllm_runner):
+    # set max_model_len > 2048
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name, task="embed", max_model_len=4096):
+            pass
+
+    # set max_model_len > 2048 by hf_overrides
+    hf_overrides = {"max_model_len": 4096}
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name,
+                         task="embed",
+                         max_model_len=None,
+                         hf_overrides=hf_overrides):
+            pass
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_use_rope_scaling_legal(model_info, vllm_runner):
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings":
+            original_max_position_embeddings
+        },
+        "max_model_len": max_model_len
+    }
+
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     max_model_len=None,
+                     hf_overrides=hf_overrides):
+        pass
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_use_rope_scaling_illegal(model_info, vllm_runner):
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings":
+            original_max_position_embeddings
+        }
+    }
+    # illegal max_model_len
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name,
+                         task="embed",
+                         max_model_len=max_model_len + 1,
+                         hf_overrides=hf_overrides):
+            pass
+
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings":
+            original_max_position_embeddings
+        },
+        "max_model_len": max_model_len + 1
+    }
+    # illegal max_model_len by hf_overrides
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name,
+                         task="embed",
+                         max_model_len=None,
+                         hf_overrides=hf_overrides):
+            pass
diff --git a/vllm/config.py b/vllm/config.py
index db35c848b..4afdda3cc 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -571,6 +571,7 @@ class ModelConfig:
 
                 sliding_window = None
 
+        self.original_max_model_len = self.max_model_len
         self.max_model_len = _get_and_verify_max_len(
             hf_config=self.hf_text_config,
             max_model_len=self.max_model_len,
@@ -4471,6 +4472,19 @@ class VllmConfig:
         self.compilation_config.init_with_cudagraph_sizes(
             batch_size_capture_list)
 
+    def recalculate_max_model_len(self, max_model_len: int):
+        model_config = self.model_config
+        max_model_len = _get_and_verify_max_len(
+            hf_config=model_config.hf_text_config,
+            max_model_len=max_model_len,
+            disable_sliding_window=model_config.disable_sliding_window,
+            sliding_window_len=model_config.get_hf_config_sliding_window(),
+            spec_target_max_model_len=model_config.spec_target_max_model_len,
+            encoder_config=model_config.encoder_config)
+        self.model_config.max_model_len = max_model_len
+        self.scheduler_config.max_model_len = max_model_len
+        self.compute_hash()
+
     def __str__(self):
         return (
             f"model={self.model_config.model!r},"
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index af6deb3bf..8a387d71f 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 from collections.abc import Iterable
+from copy import deepcopy
 from typing import Optional
 
 import torch
@@ -10,6 +11,7 @@ from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
                                                    get_act_fn)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -27,6 +29,8 @@ from vllm.model_executor.models.interfaces import SupportsQuant
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.sequence import IntermediateTensors
 
+logger = init_logger(__name__)
+
 
 class BertWithRopeEmbedding(nn.Module):
 
@@ -513,10 +517,11 @@ class NomicBertModel(BertWithRope):
 
         head_dim = config.hidden_size // config.num_attention_heads
         rotary_emb_dim = head_dim * config.rotary_emb_fraction
+        max_trained_positions = getattr(config, "max_trained_positions", 2048)
         config.rotary_kwargs = {
             "head_size": head_dim,
             "rotary_dim": rotary_emb_dim,
-            "max_position": config.max_trained_positions,
+            "max_position": max_trained_positions,
             "base": getattr(config, "rope_theta", config.rotary_emb_base),
             "rope_scaling": getattr(config, "rope_scaling", None)
         }
@@ -525,8 +530,52 @@ class NomicBertModel(BertWithRope):
         # than max_trained_positions 2048, the results are consistent
         # with SentenceTransformer.
         # The context extension uses vllm style rope_theta and rope_scaling.
-        # See #17785
-
+        # See #17785 #18755
+        if (not vllm_config.model_config.hf_overrides
+                and vllm_config.model_config.original_max_model_len is None):
+            # Default
+            # Reset max_model_len to max_trained_positions.
+            # nomic-embed-text-v2-moe the length is set to 512
+            # by sentence_bert_config.json.
+            max_model_len_before = vllm_config.model_config.max_model_len
+            max_model_len = min(vllm_config.model_config.max_model_len,
+                                max_trained_positions)
+
+            vllm_config.recalculate_max_model_len(max_model_len)
+            logger.warning(
+                "Nomic context extension is disabled. "
+                "Changing max_model_len from %s to %s. "
+                "To enable context extension, see: "
+                "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html",
+                max_model_len_before, vllm_config.model_config.max_model_len)
+        else:
+            # We need to re-verify max_model_len to avoid lengths
+            # greater than position_embedding.
+            model_config = vllm_config.model_config
+            hf_text_config = model_config.hf_text_config
+
+            if isinstance(model_config.hf_overrides, dict):
+                # hf_overrides_kw
+                max_model_len = model_config.hf_overrides.get(
+                    "max_model_len", vllm_config.model_config.max_model_len)
+            else:
+                # hf_overrides_fn
+                # This might be overridden by sentence_bert_config.json.
+                max_model_len = vllm_config.model_config.max_model_len
+
+            # reset hf_text_config for recalculate_max_model_len.
+            if hasattr(hf_text_config, "max_model_len"):
+                delattr(hf_text_config, "max_model_len")
+            hf_text_config.max_position_embeddings = max_trained_positions
+            hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"]
+
+            # The priority of sentence_bert_config.json is higher
+            # than max_position_embeddings
+            encoder_config = deepcopy(model_config.encoder_config)
+            encoder_config.pop("max_seq_length", None)
+            model_config.encoder_config = encoder_config
+
+            vllm_config.recalculate_max_model_len(max_model_len)
         return config
 
 
-- 
GitLab


From 9a21e331ff8c4ab052a654b3ebd9f67ddfff4845 Mon Sep 17 00:00:00 2001
From: Guillaume Calmettes <gcalmettes@scaleway.com>
Date: Wed, 28 May 2025 05:35:43 +0200
Subject: [PATCH 009/274] [Bugfix]: correctly propagate errors message caught
 at the chat_templating step to the client (#18769)

Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
---
 vllm/entrypoints/chat_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index ec1b327da..b051cd333 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1252,7 +1252,7 @@ def apply_hf_chat_template(
         # investigation.
         logger.exception(
             "An error occurred in `transformers` while applying chat template")
-        raise ValueError from e
+        raise ValueError(str(e)) from e
 
 def apply_mistral_chat_template(
     tokenizer: MistralTokenizer,
@@ -1281,7 +1281,7 @@ def apply_mistral_chat_template(
     # We convert those assertion errors to ValueErrors so they can be
     # are properly caught in the preprocessing_input step
     except (AssertionError, MistralCommonException) as e:
-        raise ValueError from e
+        raise ValueError(str(e)) from e
 
     # External library exceptions can sometimes occur despite the framework's
     # internal exception management capabilities.
@@ -1292,7 +1292,7 @@ def apply_mistral_chat_template(
         logger.exception(
             "An error occurred in `mistral_common` while applying chat "
             "template")
-        raise ValueError from e
+        raise ValueError(str(e)) from e
 
 def random_tool_call_id() -> str:
     return f"chatcmpl-tool-{random_uuid()}"
-- 
GitLab


From 774c5fde307dcadb8e0d2a58bf2d429eb6cb648d Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Tue, 27 May 2025 23:16:30 -0500
Subject: [PATCH 010/274] [V1] fix torch profiling for V1 offline scenarios
 (#18445)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
---
 benchmarks/benchmark_latency.py | 37 +++++++++------------------------
 vllm/benchmarks/latency.py      | 33 ++++++++---------------------
 vllm/v1/worker/gpu_worker.py    |  2 ++
 vllm/worker/worker.py           |  2 ++
 4 files changed, 23 insertions(+), 51 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 84759c5c3..de62bf5c6 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -6,13 +6,12 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional
 
 import numpy as np
-import torch
 from tqdm import tqdm
 
+import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -80,17 +79,9 @@ def main(args: argparse.Namespace):
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.CUDA,
-                ],
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    str(profile_dir)
-                ),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -103,11 +94,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)
 
     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (
-                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
-            )
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -164,15 +151,6 @@ if __name__ == "__main__":
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=(
-            "path to save the pytorch profiler output. Can be visualized "
-            "with ui.perfetto.dev or Tensorboard."
-        ),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -193,4 +171,9 @@ if __name__ == "__main__":
     # numbers. We need to disable prefix caching by default.
     parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler."
+        )
     main(args)
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 0dd938e75..c9e03cc3b 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -6,13 +6,12 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional
 
 import numpy as np
-import torch
 from tqdm import tqdm
 
+import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
                                    write_to_json)
@@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=("path to save the pytorch profiler output. Can be visualized "
-              "with ui.perfetto.dev or Tensorboard."),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -87,7 +79,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
 
 def main(args: argparse.Namespace):
     print(args)
-
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler.")
     engine_args = EngineArgs.from_cli_args(args)
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
@@ -131,16 +126,9 @@ def main(args: argparse.Namespace):
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir)),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -153,10 +141,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)
 
     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (Path(".") / "vllm_benchmark_result" /
-                           f"latency_result_{time.time()}")
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index bce5cbb5f..dd06e7296 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -292,6 +292,8 @@ class Worker(WorkerBase):
             self.profiler.start()
         else:
             self.profiler.stop()
+            print(self.profiler.key_averages().table(
+                sort_by="self_cuda_time_total"))
 
     def execute_dummy_batch(self) -> None:
         self.model_runner._dummy_run(1)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 6e45b8423..2a4317271 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase):
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
         self.profiler.stop()
+        print(
+            self.profiler.key_averages().table(sort_by="self_cuda_time_total"))
 
     def sleep(self, level: int = 1) -> None:
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
-- 
GitLab


From 5e13c07d00df0ca906c4c06110c277421202d3e2 Mon Sep 17 00:00:00 2001
From: RonaldBXu <72748153+RonaldBXu@users.noreply.github.com>
Date: Tue, 27 May 2025 22:09:14 -0700
Subject: [PATCH 011/274] [V1] [Bugfix] eagle bugfix and enable correct lm_head
 for multimodal (2) (#18781)

Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
---
 vllm/transformers_utils/configs/eagle.py | 2 +-
 vllm/v1/spec_decode/eagle.py             | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 377523efe..a43e4746c 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -70,7 +70,7 @@ class EAGLEConfig(PretrainedConfig):
 
         if self.model is not None:
             for k, v in self.model.to_dict().items():
-                if not hasattr(self, k):
+                if k not in kwargs:
                     setattr(self, k, v)
 
     @classmethod
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 971b06758..1ca856423 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -9,6 +9,7 @@ from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
 from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata,
                                                    FlashAttentionMetadata)
@@ -346,7 +347,10 @@ class EagleProposer:
         if self.vllm_config.speculative_config.method != "eagle3" and \
                 hasattr(target_model, "lm_head"):
             logger.info("Loading EAGLE LM head weights from the target model.")
-            self.model.lm_head = target_model.lm_head
+            if supports_multimodal(target_model):
+                self.model.lm_head = target_model.get_language_model().lm_head
+            else:
+                self.model.lm_head = target_model.lm_head
 
     @torch.inference_mode()
     def dummy_run(
-- 
GitLab


From b78f844a6743732b58022f2f84858d61b40b5913 Mon Sep 17 00:00:00 2001
From: Rabi Mishra <ramishra@redhat.com>
Date: Wed, 28 May 2025 11:12:54 +0530
Subject: [PATCH 012/274] [Bugfix][FailingTest]Fix
 test_model_load_with_params.py (#18758)

Signed-off-by: rabi <ramishra@redhat.com>
---
 .buildkite/test-pipeline.yaml                 | 22 +++++++++----------
 .../test_logits_processor.py                  |  0
 .../test_model_load_with_params.py            | 19 +++++++---------
 .../{weight_utils.py => test_weight_utils.py} |  0
 4 files changed, 19 insertions(+), 22 deletions(-)
 rename tests/{ => model_executor}/test_logits_processor.py (100%)
 rename tests/model_executor/{weight_utils.py => test_weight_utils.py} (100%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 80a5a610c..4e7bea25e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -274,17 +274,6 @@ steps:
     - pytest -v -s samplers
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
-- label: LogitsProcessor Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
-  source_file_dependencies:
-  - vllm/model_executor/layers
-  - vllm/model_executor/guided_decoding
-  - tests/test_logits_processor
-  - tests/model_executor/test_guided_processors
-  commands:
-    - pytest -v -s test_logits_processor.py
-    - pytest -v -s model_executor/test_guided_processors.py
-
 - label: Speculative decoding tests # 40min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
@@ -397,6 +386,17 @@ steps:
     - pytest -v -s tensorizer_loader
     - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
+- label: Model Executor Test
+  mirror_hardwares: [amdexperimental, amdproduction]
+  soft_fail: true
+  source_file_dependencies:
+  - vllm/model_executor
+  - tests/model_executor
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+
 - label: Benchmarks # 9min
   mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/.buildkite"
diff --git a/tests/test_logits_processor.py b/tests/model_executor/test_logits_processor.py
similarity index 100%
rename from tests/test_logits_processor.py
rename to tests/model_executor/test_logits_processor.py
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index f8efa2eff..7fda1f0e8 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -4,7 +4,7 @@ import os
 
 import pytest
 
-from vllm.model_executor.layers.pooler import CLSPool, PoolingType
+from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -14,7 +14,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5")
 REVISION = os.environ.get("REVISION", "main")
 
 MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
-                                    "intfloat/multilingual-e5-small")
+                                    "intfloat/multilingual-e5-base")
 REVISION_ROBERTA = os.environ.get("REVISION", "main")
 
 
@@ -40,17 +40,15 @@ def test_model_loading_with_params(vllm_runner):
 
         # asserts on the pooling config files
         assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
-        assert model_config.pooler_config.pooling_norm
+        assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
         assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5"
-        assert model_tokenizer.tokenizer_config["do_lower_case"]
         assert model_tokenizer.tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, BertEmbeddingModel)
-            assert model._pooler.pooling_type == PoolingType.CLS
-            assert model._pooler.normalize
+            assert isinstance(model._pooler, CLSPool)
 
         vllm_model.apply_model(check_model)
 
@@ -80,16 +78,15 @@ def test_roberta_model_loading_with_params(vllm_runner):
 
         # asserts on the pooling config files
         assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
-        assert model_config.pooler_config.pooling_norm
+        assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
-        assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-small"
-        assert not model_tokenizer.tokenizer_config["do_lower_case"]
+        assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-base"
+        assert model_tokenizer.tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, RobertaEmbeddingModel)
-            assert model._pooler.pooling_type == PoolingType.MEAN
-            assert model._pooler.normalize
+            assert isinstance(model._pooler, MeanPool)
 
         vllm_model.apply_model(check_model)
 
diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/test_weight_utils.py
similarity index 100%
rename from tests/model_executor/weight_utils.py
rename to tests/model_executor/test_weight_utils.py
-- 
GitLab


From 7f2c1a87e9c34df7c8820ab6c916a608d518c1e9 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 28 May 2025 15:08:35 +0800
Subject: [PATCH 013/274] [Deprecation] Require overriding `get_dummy_text` and
 `get_dummy_mm_data` (#18796)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/multimodal/profiling.py | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index a85b13fb2..53f5b243d 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-
-from abc import ABC
+from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
 from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast
@@ -60,29 +59,14 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
 
         self.info = info
 
-    # TODO: @abstractmethod after transition
+    @abstractmethod
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         """
         Build the text input corresponding to `mm_counts`.
         """
-        if (type(self).get_dummy_processor_inputs ==
-                BaseDummyInputsBuilder.get_dummy_processor_inputs):
-            raise NotImplementedError
-
-        logger.warning_once("`get_dummy_processor_inputs` has been split up "
-                            "into `get_dummy_text` and `get_dummy_mm_data`. "
-                            "These two methods will be marked as abstract "
-                            "in an upcoming release.")
-
-        seq_len = self.info.ctx.model_config.max_model_len
-
-        prompt = self.get_dummy_processor_inputs(seq_len, mm_counts).prompt
-        if not isinstance(prompt, str):
-            prompt = self.info.get_tokenizer().decode(prompt)
-
-        return prompt
+        raise NotImplementedError
 
-    # TODO: @abstractmethod after transition
+    @abstractmethod
     def get_dummy_mm_data(
         self,
         seq_len: int,
-- 
GitLab


From 0f0926b43fb5445e2468034eb0bc8076cdd6af86 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 28 May 2025 15:08:48 +0800
Subject: [PATCH 014/274] [Deprecation] Remove unused sync methods in
 `async_timeout` (#18792)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/engine/async_timeout.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py
index aa54c0693..94674262b 100644
--- a/vllm/engine/async_timeout.py
+++ b/vllm/engine/async_timeout.py
@@ -8,7 +8,6 @@
 import asyncio
 import enum
 import sys
-import warnings
 from types import TracebackType
 from typing import Any, Optional, Type
 
@@ -66,24 +65,6 @@ else:
             else:
                 self.update(deadline)
 
-        def __enter__(self) -> "Timeout":
-            warnings.warn(
-                "with timeout() is deprecated, use async with timeout()",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            self._do_enter()
-            return self
-
-        def __exit__(
-            self,
-            exc_type: Optional[Type[BaseException]],
-            exc_val: Optional[BaseException],
-            exc_tb: Optional[TracebackType],
-        ) -> Optional[bool]:
-            self._do_exit(exc_type)
-            return None
-
         async def __aenter__(self) -> "Timeout":
             self._do_enter()
             return self
-- 
GitLab


From 0c492b782420f9043e06a7c7f21ff62496a7a392 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 28 May 2025 15:09:04 +0800
Subject: [PATCH 015/274] [Deprecation] Remove fallbacks for Embeddings API
 (#18795)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/config.py                        | 17 +++++-------
 vllm/entrypoints/openai/api_server.py | 39 ++++-----------------------
 vllm/outputs.py                       | 16 +----------
 3 files changed, 12 insertions(+), 60 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 4afdda3cc..738a9b337 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -797,17 +797,12 @@ class ModelConfig:
         else:
             # Aliases
             if task_option == "embedding":
-                preferred_task = self._get_preferred_task(
-                    architectures, supported_tasks)
-                if preferred_task != "embed":
-                    msg = ("The 'embedding' task will be restricted to "
-                           "embedding models in a future release. Please "
-                           "pass `--task classify`, `--task score`, or "
-                           "`--task reward` explicitly for other pooling "
-                           "models.")
-                    warnings.warn(msg, DeprecationWarning, stacklevel=2)
-
-                task_option = preferred_task or "embed"
+                msg = ("The 'embedding' task has been renamed to "
+                       "'embed', please use the new name. The old name "
+                       "will be removed in v1.0.")
+                warnings.warn(msg, DeprecationWarning, stacklevel=2)
+
+                task_option = "embed"
 
             if task_option not in supported_tasks:
                 msg = (
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2da89b4f5..b991cb3a4 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -17,7 +17,7 @@ from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
 from json import JSONDecodeError
-from typing import Annotated, Optional, Union
+from typing import Annotated, Optional
 
 import prometheus_client
 import regex as re
@@ -59,9 +59,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               EmbeddingChatRequest,
                                               EmbeddingCompletionRequest,
                                               EmbeddingRequest,
-                                              EmbeddingResponse,
-                                              EmbeddingResponseData,
-                                              ErrorResponse,
+                                              EmbeddingResponse, ErrorResponse,
                                               LoadLoRAAdapterRequest,
                                               PoolingChatRequest,
                                               PoolingCompletionRequest,
@@ -627,37 +625,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     handler = embedding(raw_request)
     if handler is None:
-        fallback_handler = pooling(raw_request)
-        if fallback_handler is None:
-            return base(raw_request).create_error_response(
-                message="The model does not support Embeddings API")
+        return base(raw_request).create_error_response(
+            message="The model does not support Embeddings API")
 
-        logger.warning(
-            "Embeddings API will become exclusive to embedding models "
-            "in a future release. To return the hidden states directly, "
-            "use the Pooling API (`/pooling`) instead.")
-
-        res = await fallback_handler.create_pooling(request, raw_request)
-
-        generator: Union[ErrorResponse, EmbeddingResponse]
-        if isinstance(res, PoolingResponse):
-            generator = EmbeddingResponse(
-                id=res.id,
-                object=res.object,
-                created=res.created,
-                model=res.model,
-                data=[
-                    EmbeddingResponseData(
-                        index=d.index,
-                        embedding=d.data,  # type: ignore
-                    ) for d in res.data
-                ],
-                usage=res.usage,
-            )
-        else:
-            generator = res
-    else:
-        generator = await handler.create_embedding(request, raw_request)
+    generator = await handler.create_embedding(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 33cc50c87..3960388bf 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 from typing import Any, Generic, Optional, Union
 
 import torch
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -76,14 +76,6 @@ class PoolingOutput:
         return (isinstance(other, self.__class__) and bool(
             (self.data == other.data).all()))
 
-    @property
-    @deprecated("`LLM.encode()` now stores raw outputs in the `data` "
-                "attribute. To return embeddings, use `LLM.embed()`. "
-                "To return class probabilities, use `LLM.classify()` "
-                "and access the `probs` attribute. ")
-    def embedding(self) -> list[float]:
-        return self.data.tolist()
-
 
 class RequestOutput:
     """The output data of a completion request to the LLM.
@@ -506,12 +498,6 @@ class ScoringOutput:
     def __repr__(self) -> str:
         return f"ScoringOutput(score={self.score})"
 
-    @property
-    @deprecated("`LLM.score()` now returns scalar scores. "
-                "Please access it via the `score` attribute. ")
-    def embedding(self) -> list[float]:
-        return [self.score]
-
 
 class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):
 
-- 
GitLab


From de65fc8e1e7056e845a90bf4df4ff39c25cb03fa Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Wed, 28 May 2025 15:16:35 +0800
Subject: [PATCH 016/274] [CI] improve embed testing (#18747)

---
 .../openai/correctness/test_mteb.py           |  3 +-
 tests/entrypoints/openai/test_embedding.py    |  3 +-
 .../openai/test_embedding_dimensions.py       |  4 +-
 tests/models/language/pooling/embed_utils.py  | 72 ++++++++++++++
 tests/models/language/pooling/mteb_utils.py   | 11 ++-
 tests/models/language/pooling/test_baai.py    | 71 ++++++++++++++
 tests/models/language/pooling/test_gte.py     | 33 ++-----
 tests/models/language/pooling/test_jina.py    | 93 +++++++++----------
 tests/models/language/pooling/test_nomic.py   | 36 +++----
 .../pooling/test_snowflake_arctic_embed.py    | 40 ++------
 tests/models/registry.py                      |  2 +-
 tests/models/utils.py                         | 24 +----
 vllm/config.py                                | 26 +++---
 13 files changed, 244 insertions(+), 174 deletions(-)
 create mode 100644 tests/models/language/pooling/embed_utils.py
 create mode 100644 tests/models/language/pooling/test_baai.py

diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py
index ebf2f829b..44d7ac193 100644
--- a/tests/entrypoints/openai/correctness/test_mteb.py
+++ b/tests/entrypoints/openai/correctness/test_mteb.py
@@ -4,6 +4,7 @@ import os
 import pytest
 
 from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
+                                                      MTEB_EMBED_TOL,
                                                       OpenAIClientMtebEncoder,
                                                       run_mteb_embed_task,
                                                       run_mteb_embed_task_st)
@@ -38,4 +39,4 @@ def test_mteb(server):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 1019bfd58..81ca65b65 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -11,7 +11,8 @@ import requests
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...models.utils import run_embedding_correctness_test
+from ...models.language.pooling.embed_utils import (
+    run_embedding_correctness_test)
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "intfloat/multilingual-e5-small"
diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py
index 332fa332a..341defae0 100644
--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -11,7 +11,9 @@ import pytest
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 
 from ...conftest import HfRunner
-from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
+from ...models.language.pooling.embed_utils import (
+    run_embedding_correctness_test)
+from ...models.utils import EmbedModelInfo
 from ...utils import RemoteOpenAIServer
 
 MODELS = [
diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
new file mode 100644
index 000000000..0c8ac2ab1
--- /dev/null
+++ b/tests/models/language/pooling/embed_utils.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections.abc import Sequence
+from typing import Optional
+
+import pytest
+
+from tests.conftest import HfRunner
+from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
+                                matryoshka_fy)
+
+
+def run_embedding_correctness_test(
+    hf_model: "HfRunner",
+    inputs: list[str],
+    vllm_outputs: Sequence[list[float]],
+    dimensions: Optional[int] = None,
+):
+    hf_outputs = hf_model.encode(inputs)
+    if dimensions:
+        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
+
+
+def correctness_test_embed_models(hf_runner,
+                                  vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts,
+                                  vllm_extra_kwargs=None,
+                                  hf_model_callback=None):
+    if not model_info.enable_test:
+        # A model family has many models with the same architecture,
+        # and we don't need to test each one.
+        pytest.skip("Skipping test.")
+
+    # The example_prompts has ending "\n", for example:
+    # "Write a short story about a robot that dreams for the first time.\n"
+    # sentence_transformers will strip the input texts, see:
+    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
+    # This makes the input_ids different between hf_model and vllm_model.
+    # So we need to strip the input texts to avoid test failing.
+    example_prompts = [str(s).strip() for s in example_prompts]
+
+    vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = model_info.dtype
+
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     max_model_len=None,
+                     **vllm_extra_kwargs) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
+        model_dtype = getattr(
+            vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
+            vllm_dtype)
+
+    with hf_runner(
+            model_info.name,
+            dtype=model_dtype,
+            is_sentence_transformer=True,
+    ) as hf_model:
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
+        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py
index f83c9940d..f4837ae95 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -80,18 +80,19 @@ def run_mteb_embed_task_st(model_name, tasks):
 def mteb_test_embed_models(hf_runner,
                            vllm_runner,
                            model_info: EmbedModelInfo,
-                           vllm_extra_kwargs=None):
+                           vllm_extra_kwargs=None,
+                           hf_model_callback=None):
     if not model_info.enable_test:
         # A model family has many models with the same architecture,
         # and we don't need to test each one.
         pytest.skip("Skipping test.")
 
     vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = model_info.dtype
 
     with vllm_runner(model_info.name,
                      task="embed",
                      max_model_len=None,
-                     dtype=model_info.dtype,
                      **vllm_extra_kwargs) as vllm_model:
 
         if model_info.architecture:
@@ -108,10 +109,14 @@ def mteb_test_embed_models(hf_runner,
     with set_default_torch_dtype(model_dtype) and hf_runner(
             model_info.name, is_sentence_transformer=True,
             dtype=model_dtype) as hf_model:
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
         st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
 
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformer:", model_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py
new file mode 100644
index 000000000..fc0e82079
--- /dev/null
+++ b/tests/models/language/pooling/test_baai.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
+
+MODELS = [
+    ########## BertModel
+    EmbedModelInfo("BAAI/bge-base-en",
+                   architecture="BertModel",
+                   enable_test=True),
+    EmbedModelInfo("BAAI/bge-base-zh",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh-noinstruct",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-base-en-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-base-zh-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    ########## XLMRobertaModel
+    EmbedModelInfo("BAAI/bge-m3",
+                   architecture="XLMRobertaModel",
+                   enable_test=True),
+    ########## Qwen2Model
+    EmbedModelInfo("BAAI/bge-code-v1",
+                   architecture="Qwen2Model",
+                   dtype="float32",
+                   enable_test=True),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py
index 91d10f529..18b27a688 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -3,7 +3,8 @@ from typing import Any
 
 import pytest
 
-from ...utils import EmbedModelInfo, run_embedding_correctness_test
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
@@ -53,9 +54,8 @@ MODELS = [
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_mteb(hf_runner, vllm_runner,
-                     model_info: EmbedModelInfo) -> None:
-    from .mteb_utils import mteb_test_embed_models
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
 
     vllm_extra_kwargs: dict[str, Any] = {}
     if model_info.architecture == "GteNewModel":
@@ -66,28 +66,13 @@ def test_models_mteb(hf_runner, vllm_runner,
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
-                            example_prompts) -> None:
-    if not model_info.enable_test:
-        pytest.skip("Skipping test.")
-
-    # ST will strip the input texts, see test_embedding.py
-    example_prompts = [str(s).strip() for s in example_prompts]
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
 
     vllm_extra_kwargs: dict[str, Any] = {}
     if model_info.architecture == "GteNewModel":
         vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
 
-    with vllm_runner(model_info.name,
-                     task="embed",
-                     dtype=model_info.dtype,
-                     max_model_len=None,
-                     **vllm_extra_kwargs) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
-
-    with hf_runner(
-            model_info.name,
-            dtype=model_info.dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts, vllm_extra_kwargs)
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 0ddff2146..0403a20a4 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -1,9 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
+from functools import partial
+
 import pytest
 
 from vllm import PoolingParams
 
-from ...utils import check_embeddings_close, matryoshka_fy
+from .embed_utils import (EmbedModelInfo, check_embeddings_close,
+                          correctness_test_embed_models, matryoshka_fy)
+from .mteb_utils import mteb_test_embed_models
 
 SCORING_MODELS = [
     "jinaai/jina-reranker-v2-base-multilingual",  # Roberta
@@ -25,16 +29,10 @@ TEXTS_2 = [
 ]
 
 EMBEDDING_MODELS = [
-    "jinaai/jina-embeddings-v3",
-]
-
-EMBEDDING_PROMPTS = [
-    "Follow the white rabbit.",  # English
-    "Sigue al conejo blanco.",  # Spanish
-    "Suis le lapin blanc.",  # French
-    "跟着白兔走。",  # Chinese
-    "اتبع الأرنب الأبيض.",  # Arabic
-    "Folge dem weißen Kaninchen.",  # German
+    EmbedModelInfo("jinaai/jina-embeddings-v3",
+                   architecture="XLMRobertaModel",
+                   is_matryoshka=True,
+                   dtype="float32")
 ]
 
 
@@ -80,73 +78,66 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
-@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
-def emb_model_name(request):
-    yield request.param
+@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
 
+    def hf_model_callback(model):
+        model.encode = partial(model.encode, task="text-matching")
 
-def test_is_matryoshka(vllm_runner, emb_model_name):
-    with vllm_runner(emb_model_name, task="embed",
-                     max_model_len=None) as vllm_model:
-        assert vllm_model.model.llm_engine.model_config.is_matryoshka
-
-
-@pytest.mark.parametrize("model", EMBEDDING_MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_embeddings(
-    hf_runner,
-    vllm_runner,
-    model,
-    dtype: str,
-    monkeypatch,
-) -> None:
+    mteb_test_embed_models(hf_runner,
+                           vllm_runner,
+                           model_info,
+                           hf_model_callback=hf_model_callback)
 
-    example_prompts = EMBEDDING_PROMPTS
 
-    with hf_runner(
-            model,
-            dtype=dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        hf_outputs = hf_model.encode(example_prompts, task="text-matching")
+@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
 
-    with vllm_runner(model, task="embed", dtype=dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+    def hf_model_callback(model):
+        model.encode = partial(model.encode, task="text-matching")
 
-    check_embeddings_close(
-        embeddings_0_lst=hf_outputs,
-        embeddings_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
+    correctness_test_embed_models(hf_runner,
+                                  vllm_runner,
+                                  model_info,
+                                  example_prompts,
+                                  hf_model_callback=hf_model_callback)
 
 
-@pytest.mark.parametrize("model", EMBEDDING_MODELS)
+@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("dimensions", [16, 32])
 def test_matryoshka(
     hf_runner,
     vllm_runner,
-    model,
+    model_info,
     dtype: str,
     dimensions: int,
+    example_prompts,
     monkeypatch,
 ) -> None:
+    if not model_info.is_matryoshka:
+        pytest.skip("Model is not matryoshka")
 
-    example_prompts = EMBEDDING_PROMPTS
+    # ST will strip the input texts, see test_embedding.py
+    example_prompts = [str(s).strip() for s in example_prompts]
 
     with hf_runner(
-            model,
+            model_info.name,
             dtype=dtype,
             is_sentence_transformer=True,
     ) as hf_model:
         hf_outputs = hf_model.encode(example_prompts, task="text-matching")
         hf_outputs = matryoshka_fy(hf_outputs, dimensions)
 
-    with vllm_runner(model, task="embed", dtype=dtype,
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     dtype=dtype,
                      max_model_len=None) as vllm_model:
+        assert vllm_model.model.llm_engine.model_config.is_matryoshka
+
         matryoshka_dimensions = (
             vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
         assert matryoshka_dimensions is not None
diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py
index 28df32e0c..92cd7cc56 100644
--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling/test_nomic.py
@@ -2,7 +2,8 @@
 
 import pytest
 
-from ...utils import EmbedModelInfo, run_embedding_correctness_test
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     EmbedModelInfo("nomic-ai/nomic-embed-text-v1",
@@ -13,6 +14,9 @@ MODELS = [
                    architecture="NomicBertModel",
                    dtype="float32",
                    enable_test=False),
+    EmbedModelInfo("nomic-ai/CodeRankEmbed",
+                   architecture="NomicBertModel",
+                   enable_test=False),
     EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
                    architecture="NomicBertModel",
                    dtype="float32",
@@ -21,30 +25,14 @@ MODELS = [
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_mteb(hf_runner, vllm_runner,
-                     model_info: EmbedModelInfo) -> None:
-    from .mteb_utils import mteb_test_embed_models
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
-                            example_prompts) -> None:
-    if not model_info.enable_test:
-        pytest.skip("Skipping test.")
-
-    # ST will strip the input texts, see test_embedding.py
-    example_prompts = [str(s).strip() for s in example_prompts]
-
-    with vllm_runner(model_info.name,
-                     task="embed",
-                     dtype=model_info.dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
-
-    with hf_runner(
-            model_info.name,
-            dtype=model_info.dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py
index 5679e0e1c..c6c2d1e7a 100644
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -2,7 +2,8 @@
 
 import pytest
 
-from ...utils import EmbedModelInfo, run_embedding_correctness_test
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
@@ -41,37 +42,14 @@ MODELS = [
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_mteb(
-    hf_runner,
-    vllm_runner,
-    model_info: EmbedModelInfo,
-) -> None:
-    from .mteb_utils import mteb_test_embed_models
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_correctness(
-    hf_runner,
-    vllm_runner,
-    model_info: EmbedModelInfo,
-    example_prompts,
-) -> None:
-    if not model_info.enable_test:
-        pytest.skip("Skipping test.")
-
-    # ST will strip the input texts, see test_embedding.py
-    example_prompts = [str(s).strip() for s in example_prompts]
-
-    with vllm_runner(model_info.name,
-                     task="embed",
-                     dtype=model_info.dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
-
-    with hf_runner(
-            model_info.name,
-            dtype=model_info.dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index a49e3ad6b..18342b671 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -283,7 +283,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
     "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
                                 trust_remote_code=True),
-    "NomicBertModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-long",  # noqa: E501
+    "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
                                                trust_remote_code=True),
     "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
     "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
diff --git a/tests/models/utils.py b/tests/models/utils.py
index a43fd77c6..ac1fc6c8f 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -2,7 +2,7 @@
 
 import warnings
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -13,9 +13,6 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 
 from .registry import HF_EXAMPLE_MODELS
 
-if TYPE_CHECKING:
-    from ..conftest import HfRunner
-
 TokensText = tuple[list[int], str]
 
 
@@ -337,22 +334,3 @@ class EmbedModelInfo(NamedTuple):
     architecture: str = ""
     dtype: str = "auto"
     enable_test: bool = True
-
-
-def run_embedding_correctness_test(
-    hf_model: "HfRunner",
-    inputs: list[str],
-    vllm_outputs: Sequence[list[float]],
-    dimensions: Optional[int] = None,
-):
-    hf_outputs = hf_model.encode(inputs)
-    if dimensions:
-        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
-
-    check_embeddings_close(
-        embeddings_0_lst=hf_outputs,
-        embeddings_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
diff --git a/vllm/config.py b/vllm/config.py
index 738a9b337..30d1a64a4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -572,13 +572,7 @@ class ModelConfig:
                 sliding_window = None
 
         self.original_max_model_len = self.max_model_len
-        self.max_model_len = _get_and_verify_max_len(
-            hf_config=self.hf_text_config,
-            max_model_len=self.max_model_len,
-            disable_sliding_window=self.disable_sliding_window,
-            sliding_window_len=self.get_hf_config_sliding_window(),
-            spec_target_max_model_len=self.spec_target_max_model_len,
-            encoder_config=self.encoder_config)
+        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
         self.served_model_name = get_served_model_name(self.model,
                                                        self.served_model_name)
         self.multimodal_config = self._init_multimodal_config()
@@ -1382,6 +1376,16 @@ class ModelConfig:
     def matryoshka_dimensions(self):
         return getattr(self.hf_config, "matryoshka_dimensions", None)
 
+    def get_and_verify_max_len(self, max_model_len: int):
+        max_model_len = _get_and_verify_max_len(
+            hf_config=self.hf_text_config,
+            max_model_len=max_model_len,
+            disable_sliding_window=self.disable_sliding_window,
+            sliding_window_len=self.get_hf_config_sliding_window(),
+            spec_target_max_model_len=self.spec_target_max_model_len,
+            encoder_config=self.encoder_config)
+        return max_model_len
+
 
 BlockSize = Literal[1, 8, 16, 32, 64, 128]
 CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"]
@@ -4469,13 +4473,7 @@ class VllmConfig:
 
     def recalculate_max_model_len(self, max_model_len: int):
         model_config = self.model_config
-        max_model_len = _get_and_verify_max_len(
-            hf_config=model_config.hf_text_config,
-            max_model_len=max_model_len,
-            disable_sliding_window=model_config.disable_sliding_window,
-            sliding_window_len=model_config.get_hf_config_sliding_window(),
-            spec_target_max_model_len=model_config.spec_target_max_model_len,
-            encoder_config=model_config.encoder_config)
+        max_model_len = model_config.get_and_verify_max_len(max_model_len)
         self.model_config.max_model_len = max_model_len
         self.scheduler_config.max_model_len = max_model_len
         self.compute_hash()
-- 
GitLab


From aa42561e4054f43f41bf0ea564369f5ea3147316 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Wed, 28 May 2025 04:40:53 -0400
Subject: [PATCH 017/274] Fix PiecewiseCompileInterpreter (#17338)

Signed-off-by: rzou <zou3519@gmail.com>
---
 vllm/compilation/backends.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 8114cddcd..0358c9d0d 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -10,6 +10,7 @@ from typing import Any, Callable, Optional
 
 import torch
 import torch.fx as fx
+from torch._dispatch.python import enable_python_dispatcher
 
 import vllm.envs as envs
 from vllm.config import CompilationConfig, VllmConfig
@@ -269,7 +270,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
             self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
             for t in args
         ]
-        with self.fake_mode:
+        with self.fake_mode, enable_python_dispatcher():
             return super().run(*fake_args)
 
     def call_module(self, target: torch.fx.node.Target,
-- 
GitLab


From ce75efeecb57acb5421aeb545a95e922f3dc8b3e Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Wed, 28 May 2025 04:59:39 -0400
Subject: [PATCH 018/274] [BugFix] FA2 MLA Accuracy Issue (#18807)

Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
---
 csrc/attention/merge_attn_states.cu      | 8 ++++++++
 vllm/attention/backends/mla/common.py    | 8 ++++----
 vllm/v1/attention/backends/mla/common.py | 8 ++++----
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 14e5edd7e..6bee9e4ce 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -143,6 +143,14 @@ void merge_attn_states_launcher(torch::Tensor& output,
   const uint pack_size = 16 / sizeof(scalar_t);
   TORCH_CHECK(head_size % pack_size == 0,
               "headsize must be multiple of pack_size:", pack_size);
+  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
+              "output heads must be contiguous in memory");
+  TORCH_CHECK(
+      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
+      "prefix_output heads must be contiguous in memory");
+  TORCH_CHECK(
+      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
+      "suffix_output heads must be contiguous in memory");
   float* output_lse_ptr = nullptr;
   if (output_lse.has_value()) {
     output_lse_ptr = output_lse.value().data_ptr<float>();
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index d48462684..1007140ef 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -1093,10 +1093,6 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         if isinstance(attn_out, tuple):
             attn_out, *rest = attn_out
 
-        # unpad if necessary
-        if self._pad_v:
-            attn_out = attn_out[..., :v.shape[-1]]
-
         # Remain consistent with old `flash_attn_varlen_func` where there
         # is only one output tensor if `return_softmax_lse` is False.
         if return_softmax_lse:
@@ -1294,6 +1290,10 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
                 suffix_lse=suffix_lse,
             )
 
+        # unpad if necessary
+        if self._pad_v:
+            output = output[..., :v.shape[-1]]
+
         return output.flatten(start_dim=-2)
 
     @abstractmethod
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 83e181116..1edfab26b 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -653,10 +653,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         if isinstance(attn_out, tuple):
             attn_out, lse = attn_out[0], attn_out[1]
 
-        # unpad if necessary
-        if self._pad_v:
-            attn_out = attn_out[..., :v.shape[-1]]
-
         # Remain consistent with old `flash_attn_varlen_func` where there
         # is only one output tensor if `return_softmax_lse` is False.
         if return_softmax_lse:
@@ -839,6 +835,10 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
                 suffix_lse=suffix_lse,
             )
 
+        # unpad if necessary
+        if self._pad_v:
+            output = output[..., :v.shape[-1]]
+
         return output.flatten(start_dim=-2)
 
     @abstractmethod
-- 
GitLab


From d781930f90c0863ee1978820caf0568f59bf39ca Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Wed, 28 May 2025 18:52:34 +0800
Subject: [PATCH 019/274] [Platform][Dist] Make torch distributed process group
 extendable (#18763)

Signed-off-by: Mengqing Cao <cmq0113@163.com>
---
 vllm/distributed/utils.py   | 89 ++++++++++++++++++++++---------------
 vllm/platforms/cuda.py      | 33 ++++++++++++++
 vllm/platforms/interface.py | 16 +++++++
 vllm/platforms/rocm.py      | 33 ++++++++++++++
 4 files changed, 134 insertions(+), 37 deletions(-)

diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 93a069d36..96d08dc1a 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -5,7 +5,6 @@
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 import dataclasses
-import datetime
 import os
 import pickle
 import socket
@@ -14,14 +13,14 @@ import time
 import uuid
 from collections import deque
 from collections.abc import Sequence
+from datetime import timedelta
 from typing import Any, Optional
 
 import torch
 from torch.distributed import ProcessGroup, TCPStore
 from torch.distributed.distributed_c10d import (Backend, PrefixStore,
                                                 _get_default_timeout,
-                                                _unregister_process_group,
-                                                is_nccl_available)
+                                                _unregister_process_group)
 from torch.distributed.rendezvous import rendezvous
 
 import vllm.envs as envs
@@ -406,7 +405,7 @@ class StatelessProcessGroup:
             port=port,
             world_size=world_size,
             is_master=launch_server,
-            timeout=datetime.timedelta(seconds=store_timeout),
+            timeout=timedelta(seconds=store_timeout),
             use_libuv=False,  # for now: github.com/pytorch/pytorch/pull/150215
             master_listen_fd=listen_fd,
         )
@@ -419,6 +418,43 @@ class StatelessProcessGroup:
             data_expiration_seconds=data_expiration_seconds)
 
 
+def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore,
+                            group_rank: int, group_size: int,
+                            timeout: timedelta) -> ProcessGroup:
+    """
+    Stateless init ProcessGroup with gloo backend compatible with 
+    different torch versions.
+    """
+    if is_torch_equal_or_newer("2.6"):
+        pg = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+    else:
+        options = ProcessGroup.Options(backend=backend)
+        pg = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+            options,
+        )
+    from torch.distributed.distributed_c10d import ProcessGroupGloo
+    backend_class = ProcessGroupGloo(prefix_store,
+                                     group_rank,
+                                     group_size,
+                                     timeout=timeout)
+    backend_type = ProcessGroup.BackendType.GLOO
+    device = torch.device("cpu")
+    if is_torch_equal_or_newer("2.6"):
+        # _set_default_backend is supported in torch >= 2.6
+        pg._set_default_backend(backend_type)
+    backend_class._set_sequence_number_for_group()
+
+    pg._register_backend(device, backend_type, backend_class)
+    return pg
+
+
 def stateless_init_torch_distributed_process_group(
         host: str, port: int, rank: int, world_size: int,
         backend: str) -> ProcessGroup:
@@ -468,40 +504,19 @@ def stateless_init_torch_distributed_process_group(
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
 
-    pg: ProcessGroup = ProcessGroup(
-        prefix_store,
-        group_rank,
-        group_size,
-    )
-
     if backend == "gloo":
-        from torch.distributed.distributed_c10d import ProcessGroupGloo
-        backend_class = ProcessGroupGloo(prefix_store,
-                                         group_rank,
-                                         group_size,
-                                         timeout=timeout)
-        backend_type = ProcessGroup.BackendType.GLOO
-        device = torch.device("cpu")
-    elif backend == "nccl":
-        assert is_nccl_available()
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-    else:
-        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
-
-    pg._set_default_backend(backend_type)
-    backend_class._set_sequence_number_for_group()
-
-    pg._register_backend(device, backend_type, backend_class)
-
-    return pg
+        return init_gloo_process_group(backend=backend,
+                                       prefix_store=prefix_store,
+                                       group_rank=group_rank,
+                                       group_size=group_size,
+                                       timeout=timeout)
+    from vllm.platforms import current_platform
+    return current_platform.stateless_init_device_torch_dist_pg(
+        backend=backend,
+        prefix_store=prefix_store,
+        group_rank=group_rank,
+        group_size=group_size,
+        timeout=timeout)
 
 
 def stateless_destroy_torch_distributed_process_group(
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 8bb3dfe74..0bed44f73 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -4,10 +4,13 @@ pynvml. However, it should not initialize cuda context.
 """
 
 import os
+from datetime import timedelta
 from functools import wraps
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
+from torch.distributed.distributed_c10d import is_nccl_available
 from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
@@ -316,6 +319,36 @@ class CudaPlatformBase(Platform):
     def get_piecewise_backend_cls(cls) -> str:
         return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
 
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        assert is_nccl_available()
+        pg: ProcessGroup = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+        from torch.distributed.distributed_c10d import ProcessGroupNCCL
+
+        backend_options = ProcessGroupNCCL.Options()
+        backend_options._timeout = timeout
+
+        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
+                                         backend_options)
+        backend_type = ProcessGroup.BackendType.NCCL
+        device = torch.device("cuda")
+        pg._set_default_backend(backend_type)
+        backend_class._set_sequence_number_for_group()
+
+        pg._register_backend(device, backend_type, backend_class)
+        return pg
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 504c3b42a..5c4f7a2f7 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -3,11 +3,13 @@ import enum
 import os
 import platform
 import random
+from datetime import timedelta
 from platform import uname
 from typing import TYPE_CHECKING, NamedTuple, Optional, Union
 
 import numpy as np
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
 
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
@@ -486,6 +488,20 @@ class Platform:
         """
         return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa
 
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        """
+        Init platform-specific torch distributed process group.
+        """
+        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index b5e742c65..d544b4ab4 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -1,10 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+from datetime import timedelta
 from functools import cache, lru_cache, wraps
 from typing import TYPE_CHECKING, Optional
 
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
+from torch.distributed.distributed_c10d import is_nccl_available
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -387,3 +390,33 @@ class RocmPlatform(Platform):
     @classmethod
     def get_piecewise_backend_cls(cls) -> str:
         return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        assert is_nccl_available()
+        pg: ProcessGroup = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+        from torch.distributed.distributed_c10d import ProcessGroupNCCL
+
+        backend_options = ProcessGroupNCCL.Options()
+        backend_options._timeout = timeout
+
+        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
+                                         backend_options)
+        backend_type = ProcessGroup.BackendType.NCCL
+        device = torch.device("cuda")
+        pg._set_default_backend(backend_type)
+        backend_class._set_sequence_number_for_group()
+
+        pg._register_backend(device, backend_type, backend_class)
+        return pg
-- 
GitLab


From 4c2b38ce9e90a0ac7c3e7ca400daf3a622cc7bca Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 28 May 2025 13:46:04 +0100
Subject: [PATCH 020/274] Enable Pydantic mypy checks and convert configs to
 Pydantic dataclasses (#17599)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .pre-commit-config.yaml                       |   2 +-
 pyproject.toml                                |   1 +
 tests/lora/test_quant_model.py                |  14 +--
 tests/tracing/test_tracing.py                 |   2 +-
 vllm/config.py                                | 108 ++++++++++--------
 vllm/engine/arg_utils.py                      |  43 +++----
 vllm/entrypoints/llm.py                       |   5 +-
 vllm/entrypoints/openai/protocol.py           |  12 +-
 vllm/entrypoints/openai/serving_engine.py     |  16 +--
 .../guided_decoding/guided_fields.py          |   6 +-
 vllm/utils.py                                 |   8 --
 11 files changed, 115 insertions(+), 102 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b45619a32..628782228 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -58,7 +58,7 @@ repos:
     entry: tools/mypy.sh 0 "local"
     language: python
     types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
     stages: [pre-commit] # Don't run in CI
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9
diff --git a/pyproject.toml b/pyproject.toml
index 62a734d79..eb55a9ffc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -110,6 +110,7 @@ ignore = [
 ]
 
 [tool.mypy]
+plugins = ['pydantic.mypy']
 ignore_missing_imports = true
 check_untyped_defs = true
 follow_imports = "silent"
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index caf71976a..7a76ffb74 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -24,16 +24,16 @@ if current_platform.is_rocm():
     MODELS = [
         ModelWithQuantization(
             model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="GPTQ"),
+            quantization="gptq"),
     ]
 else:
     MODELS = [
         ModelWithQuantization(
             model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
-            quantization="AWQ"),
+            quantization="awq"),
         ModelWithQuantization(
             model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="GPTQ"),
+            quantization="gptq"),
     ]
 
 
@@ -100,7 +100,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
             "#ff8050",
             "#ff8080",
         ]
-    elif model.quantization == "AWQ":
+    elif model.quantization == "awq":
         expected_no_lora_output = [
             "I'm sorry, I don't understand",
             "I'm sorry, I don't understand",
@@ -109,7 +109,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
             "#f07700: A v",
             "#f00000: A v",
         ]
-    elif model.quantization == "GPTQ":
+    elif model.quantization == "gptq":
         expected_no_lora_output = [
             "I'm sorry, I don't have",
             "I'm sorry, I don't have",
@@ -122,7 +122,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
     def expect_match(output, expected_output):
         # HACK: GPTQ lora outputs are just incredibly unstable.
         # Assert that the outputs changed.
-        if (model.quantization == "GPTQ"
+        if (model.quantization == "gptq"
                 and expected_output is expected_lora_output):
             assert output != expected_no_lora_output
             for i, o in enumerate(output):
@@ -172,7 +172,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                                  model):
     if num_gpus_available < 2:
         pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
-    if model.quantization == "GPTQ":
+    if model.quantization == "gptq":
         pytest.skip("GPTQ lora outputs are just incredibly unstable")
     llm_tp1 = vllm.LLM(
         model=model.model_path,
diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index a781b8b56..caa233ec3 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -173,7 +173,7 @@ def test_traces_with_detailed_steps(
         llm = LLM(
             model=model,
             otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-            collect_detailed_traces="all",
+            collect_detailed_traces=["all"],
         )
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
diff --git a/vllm/config.py b/vllm/config.py
index 30d1a64a4..4d9ca580f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -11,8 +11,8 @@ import uuid
 import warnings
 from collections import Counter
 from contextlib import contextmanager
-from dataclasses import (MISSING, Field, asdict, dataclass, field, fields,
-                         is_dataclass, replace)
+from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
+                         replace)
 from functools import cached_property
 from importlib.util import find_spec
 from pathlib import Path
@@ -21,9 +21,12 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional,
 
 import regex as re
 import torch
+from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
+                      model_validator)
+from pydantic.dataclasses import dataclass
 from torch.distributed import ProcessGroup, ReduceOp
 from transformers import PretrainedConfig
-from typing_extensions import deprecated
+from typing_extensions import deprecated, runtime_checkable
 
 import vllm.envs as envs
 from vllm import version
@@ -57,10 +60,15 @@ if TYPE_CHECKING:
     from vllm.model_executor.layers.quantization.base_config import (
         QuantizationConfig)
     from vllm.model_executor.model_loader import BaseModelLoader
+    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
     ConfigType = type[DataclassInstance]
 else:
+    PlacementGroup = Any
+    ExecutorBase = Any
     QuantizationConfig = Any
+    BaseModelLoader = Any
+    TensorizerConfig = Any
     ConfigType = type
 
 logger = init_logger(__name__)
@@ -92,6 +100,7 @@ HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
                                              PretrainedConfig]]
 
 
+@runtime_checkable
 class SupportsHash(Protocol):
 
     def compute_hash(self) -> str:
@@ -223,7 +232,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class ModelConfig:
     """Configuration for the model."""
 
@@ -236,7 +245,7 @@ class ModelConfig:
     task, even if the same model can be used for multiple tasks. When the model
     only supports one task, "auto" can be used to select it; otherwise, you
     must specify explicitly which task to use."""
-    tokenizer: str = None  # type: ignore
+    tokenizer: SkipValidation[str] = None  # type: ignore
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
     tokenizer_mode: TokenizerMode = "auto"
@@ -284,7 +293,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int = None  # type: ignore
+    max_model_len: SkipValidation[int] = None  # type: ignore
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
@@ -602,6 +611,22 @@ class ModelConfig:
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
+    @field_validator("quantization", mode="before")
+    @classmethod
+    def validate_quantization_before(cls, value: Any) -> Any:
+        if isinstance(value, str):
+            return value.lower()
+        return value
+
+    @model_validator(mode="after")
+    def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
+        if not isinstance(self.tokenizer, str):
+            raise ValueError("tokenizer must be a string after __post_init__.")
+        if not isinstance(self.max_model_len, int):
+            raise ValueError(
+                "max_model_len must be an integer after __post_init__.")
+        return self
+
     @property
     def registry(self):
         return ModelRegistry
@@ -823,8 +848,7 @@ class ModelConfig:
             "quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
-            self.quantization = cast(QuantizationMethods,
-                                     self.quantization.lower())
+            self.quantization = cast(QuantizationMethods, self.quantization)
 
         # Parse quantization method from the HF model config, if available.
         quant_cfg = self._parse_quant_hf_config()
@@ -1397,7 +1421,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: BlockSize = None  # type: ignore
+    block_size: SkipValidation[BlockSize] = None  # type: ignore
     """Size of a contiguous cache block in number of tokens. This is ignored on
     neuron devices and set to `--max-model-len`. On CUDA devices, only block
     sizes up to 32 are supported. On HPU devices, block size defaults to 128.
@@ -1619,7 +1643,8 @@ class LoadConfig:
     download_dir: Optional[str] = None
     """Directory to download and load the weights, default to the default
     cache directory of Hugging Face."""
-    model_loader_extra_config: dict = field(default_factory=dict)
+    model_loader_extra_config: Union[dict, TensorizerConfig] = field(
+        default_factory=dict)
     """Extra config for model loader. This will be passed to the model loader
     corresponding to the chosen load_format."""
     ignore_patterns: Optional[Union[list[str], str]] = None
@@ -1929,19 +1954,19 @@ class SchedulerConfig:
     runner_type: RunnerType = "generate"
     """The runner type to launch for the model."""
 
-    max_num_batched_tokens: int = None  # type: ignore
+    max_num_batched_tokens: SkipValidation[int] = None  # type: ignore
     """Maximum number of tokens to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_num_seqs: int = None  # type: ignore
+    max_num_seqs: SkipValidation[int] = None  # type: ignore
     """Maximum number of sequences to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_model_len: int = None  # type: ignore
+    max_model_len: SkipValidation[int] = None  # type: ignore
     """Maximum length of a sequence (including prompt and generated text). This
     is primarily set in `ModelConfig` and that value should be manually
     duplicated here."""
@@ -1980,7 +2005,7 @@ class SchedulerConfig:
     """Apply a delay (of delay factor multiplied by previous
     prompt latency) before scheduling next prompt."""
 
-    enable_chunked_prefill: bool = None  # type: ignore
+    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
     """If True, prefill requests can be chunked based
     on the remaining max_num_batched_tokens."""
 
@@ -2202,7 +2227,7 @@ Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"]
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""
 
@@ -2260,8 +2285,8 @@ class DeviceConfig:
             self.device = torch.device(self.device_type)
 
 
-SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator",
-                            "draft_model", "deepseek_mtp"]
+SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
+                            "mlp_speculator", "draft_model", "deepseek_mtp"]
 SpeculativeAcceptanceMethod = Literal["rejection_sampler",
                                       "typical_acceptance_sampler"]
 
@@ -2272,8 +2297,7 @@ class SpeculativeConfig:
     """Configuration for speculative decoding."""
 
     # General speculative decoding control
-    num_speculative_tokens: int = field(default=None,
-                                        init=True)  # type: ignore
+    num_speculative_tokens: SkipValidation[int] = None  # type: ignore
     """The number of speculative tokens, if provided. It will default to the
     number in the draft model config if present, otherwise, it is required."""
     model: Optional[str] = None
@@ -2349,26 +2373,23 @@ class SpeculativeConfig:
     """Specifies the tree structure for speculative token generation.
     """
     # required configuration params passed from engine
-    target_model_config: ModelConfig = field(default=None,
-                                             init=True)  # type: ignore
+    target_model_config: SkipValidation[ModelConfig] = None  # type: ignore
     """The configuration of the target model."""
-    target_parallel_config: ParallelConfig = field(default=None,
-                                                   init=True)  # type: ignore
+    target_parallel_config: SkipValidation[
+        ParallelConfig] = None  # type: ignore
     """The parallel configuration for the target model."""
-    enable_chunked_prefill: bool = field(default=None,
-                                         init=True)  # type: ignore
+    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
     """Whether vLLM is configured to use chunked prefill or not. Used for
     raising an error since it's not yet compatible with speculative decode."""
-    disable_log_stats: bool = field(default=None, init=True)  # type: ignore
+    disable_log_stats: SkipValidation[bool] = None  # type: ignore
     """Whether to disable the periodic printing of stage times in speculative
     decoding."""
 
     # params generated in the post-init stage
-    draft_model_config: ModelConfig = field(default=None,
-                                            init=True)  # type: ignore
+    draft_model_config: SkipValidation[ModelConfig] = None  # type: ignore
     """The configuration of the draft model initialized internal."""
-    draft_parallel_config: ParallelConfig = field(default=None,
-                                                  init=True)  # type: ignore
+    draft_parallel_config: SkipValidation[
+        ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
     def compute_hash(self) -> str:
@@ -2766,7 +2787,7 @@ LoRADType = Literal["auto", "float16", "bfloat16"]
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class LoRAConfig:
     """Configuration for LoRA."""
 
@@ -2863,7 +2884,7 @@ class LoRAConfig:
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class PromptAdapterConfig:
     """Configuration for PromptAdapters."""
 
@@ -3892,17 +3913,11 @@ class CompilationConfig:
             "pass_config",
             "traced_files",
         }
-        include = dict()
-        for k, v in asdict(self).items():
-            if k in exclude:
-                continue
-            f = get_field(CompilationConfig, k)
-            if (d := f.default) is not MISSING and d == v:
-                continue
-            if (df := f.default_factory) is not MISSING and df() == v:
-                continue
-            include[k] = v
-        return json.dumps(include)
+        # The cast to string is necessary because Pydantic is mocked in docs
+        # builds and sphinx-argparse doesn't know the return type of decode()
+        return str(
+            TypeAdapter(CompilationConfig).dump_json(
+                self, exclude=exclude, exclude_unset=True).decode())
 
     __str__ = __repr__
 
@@ -3911,7 +3926,7 @@ class CompilationConfig:
         """Parse the CLI value for the compilation config."""
         if cli_value in ["0", "1", "2", "3"]:
             return cls(level=int(cli_value))
-        return cls(**json.loads(cli_value))
+        return TypeAdapter(CompilationConfig).validate_json(cli_value)
 
     def __post_init__(self) -> None:
         count_none = self.custom_ops.count("none")
@@ -4037,7 +4052,7 @@ class CompilationConfig:
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class VllmConfig:
     """Dataclass which contains all vllm-related configuration. This
     simplifies passing around the distinct configurations in the codebase.
@@ -4294,9 +4309,6 @@ class VllmConfig:
                 "To workaround this limitation, vLLM will set 'ieee' input "
                 "precision for chunked prefill triton kernels.")
 
-        if self.compilation_config is None:
-            self.compilation_config = CompilationConfig()
-
         # async tp is built on top of sequence parallelism
         # and requires it to be enabled.
         if self.compilation_config.pass_config.enable_async_tp:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 442e4100f..2a1a34211 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -14,6 +14,7 @@ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
 
 import regex as re
 import torch
+from pydantic import SkipValidation, TypeAdapter, ValidationError
 from typing_extensions import TypeIs, deprecated
 
 import vllm.envs as envs
@@ -38,7 +39,7 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
-                        GiB_bytes, is_in_doc_build, is_in_ray_actor)
+                        GiB_bytes, is_in_ray_actor)
 
 # yapf: enable
 
@@ -156,7 +157,8 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
         # Get the set of possible types for the field
         type_hints: set[TypeHint] = set()
         if get_origin(field.type) in {Union, Annotated}:
-            type_hints.update(get_args(field.type))
+            predicate = lambda arg: not isinstance(arg, SkipValidation)
+            type_hints.update(filter(predicate, get_args(field.type)))
         else:
             type_hints.add(field.type)
 
@@ -168,10 +170,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
         if field.default is not MISSING:
             default = field.default
         elif field.default_factory is not MISSING:
-            if is_dataclass(field.default_factory) and is_in_doc_build():
-                default = {}
-            else:
-                default = field.default_factory()
+            default = field.default_factory()
 
         # Get the help text for the field
         name = field.name
@@ -189,12 +188,16 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
         - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
         - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n"""
         if dataclass_cls is not None:
-            dataclass_init = lambda x, f=dataclass_cls: f(**json.loads(x))
-            # Special case for configs with a from_cli method
-            if hasattr(dataclass_cls, "from_cli"):
-                from_cli = dataclass_cls.from_cli
-                dataclass_init = lambda x, f=from_cli: f(x)
-            kwargs[name]["type"] = dataclass_init
+
+            def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
+                try:
+                    if hasattr(cls, "from_cli"):
+                        return cls.from_cli(val)
+                    return TypeAdapter(cls).validate_json(val)
+                except ValidationError as e:
+                    raise argparse.ArgumentTypeError(repr(e)) from e
+
+            kwargs[name]["type"] = parse_dataclass
             kwargs[name]["help"] += json_tip
         elif contains_type(type_hints, bool):
             # Creates --no-<name> and --<name> flags
@@ -225,12 +228,11 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
                 kwargs[name]["type"] = human_readable_int
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
-        elif contains_type(type_hints,
-                           dict) and (contains_type(type_hints, str) or any(
-                               is_not_builtin(th) for th in type_hints)):
+        elif (contains_type(type_hints, dict)
+              and (contains_type(type_hints, str)
+                   or any(is_not_builtin(th) for th in type_hints))):
             kwargs[name]["type"] = union_dict_and_str
         elif contains_type(type_hints, dict):
-            # Dict arguments will always be optional
             kwargs[name]["type"] = parse_type(json.loads)
             kwargs[name]["help"] += json_tip
         elif (contains_type(type_hints, str)
@@ -317,8 +319,7 @@ class EngineArgs:
     rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
     rope_theta: Optional[float] = ModelConfig.rope_theta
     hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
-    hf_overrides: Optional[HfOverrides] = \
-        get_field(ModelConfig, "hf_overrides")
+    hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
     tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
     quantization: Optional[QuantizationMethods] = ModelConfig.quantization
     enforce_eager: bool = ModelConfig.enforce_eager
@@ -398,7 +399,8 @@ class EngineArgs:
         get_field(ModelConfig, "override_neuron_config")
     override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
         ModelConfig.override_pooler_config
-    compilation_config: Optional[CompilationConfig] = None
+    compilation_config: CompilationConfig = \
+        get_field(VllmConfig, "compilation_config")
     worker_cls: str = ParallelConfig.worker_cls
     worker_extension_cls: str = ParallelConfig.worker_extension_cls
 
@@ -413,7 +415,8 @@ class EngineArgs:
 
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
 
-    additional_config: Optional[Dict[str, Any]] = None
+    additional_config: dict[str, Any] = \
+        get_field(VllmConfig, "additional_config")
     enable_reasoning: Optional[bool] = None  # DEPRECATED
     reasoning_parser: str = DecodingConfig.reasoning_backend
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 59cc44eb0..7e2e5161c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -207,6 +207,9 @@ class LLM:
             if isinstance(worker_cls, type):
                 kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
 
+        if hf_overrides is None:
+            hf_overrides = {}
+
         if compilation_config is not None:
             if isinstance(compilation_config, int):
                 compilation_config_instance = CompilationConfig(
@@ -218,7 +221,7 @@ class LLM:
             else:
                 compilation_config_instance = compilation_config
         else:
-            compilation_config_instance = None
+            compilation_config_instance = CompilationConfig()
 
         engine_args = EngineArgs(
             model=model,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 393cf381b..a7f85e9ee 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -175,11 +175,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
     type: Literal["function"] = "function"
 
 
+# extra="forbid" is a workaround to have kwargs as a field,
+# see https://github.com/pydantic/pydantic/issues/3125
 class LogitsProcessorConstructor(BaseModel):
     qualname: str
     args: Optional[list[Any]] = None
     kwargs: Optional[dict[str, Any]] = None
 
+    model_config = ConfigDict(extra="forbid")
+
 
 LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
 
@@ -234,7 +238,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     presence_penalty: Optional[float] = 0.0
     response_format: Optional[AnyResponseFormat] = None
     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
+    stop: Optional[Union[str, list[str]]] = []
     stream: Optional[bool] = False
     stream_options: Optional[StreamOptions] = None
     temperature: Optional[float] = None
@@ -258,7 +262,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     min_p: Optional[float] = None
     repetition_penalty: Optional[float] = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = Field(default_factory=list)
+    stop_token_ids: Optional[list[int]] = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
@@ -756,7 +760,7 @@ class CompletionRequest(OpenAIBaseModel):
     n: int = 1
     presence_penalty: Optional[float] = 0.0
     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
+    stop: Optional[Union[str, list[str]]] = []
     stream: Optional[bool] = False
     stream_options: Optional[StreamOptions] = None
     suffix: Optional[str] = None
@@ -770,7 +774,7 @@ class CompletionRequest(OpenAIBaseModel):
     min_p: Optional[float] = None
     repetition_penalty: Optional[float] = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = Field(default_factory=list)
+    stop_token_ids: Optional[list[int]] = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index c73575b48..f96a4ac8b 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -134,11 +134,9 @@ class RequestProcessingMixin(BaseModel):
     Mixin for request processing, 
     handling prompt preparation and engine input.
     """
-    request_prompts: Optional[Sequence[RequestPrompt]] = \
-                            Field(default_factory=list)
+    request_prompts: Optional[Sequence[RequestPrompt]] = []
     engine_prompts: Optional[Union[list[EngineTokensPrompt],
-                                   list[EngineEmbedsPrompt]]] = Field(
-                                       default_factory=list)
+                                   list[EngineEmbedsPrompt]]] = []
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -528,12 +526,14 @@ class OpenAIServing:
         if isinstance(request,
                       (EmbeddingChatRequest, EmbeddingCompletionRequest,
                        ScoreRequest, RerankRequest, ClassificationRequest)):
-            operation = {
-                ScoreRequest: "score",
-                ClassificationRequest: "classification"
-            }.get(type(request), "embedding generation")
 
             if token_num > self.max_model_len:
+                operations: dict[type[AnyRequest], str] = {
+                    ScoreRequest: "score",
+                    ClassificationRequest: "classification"
+                }
+                operation = operations.get(type(request),
+                                           "embedding generation")
                 raise ValueError(
                     f"This model's maximum context length is "
                     f"{self.max_model_len} tokens. However, you requested "
diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py
index 085f37a5d..316860718 100644
--- a/vllm/model_executor/guided_decoding/guided_fields.py
+++ b/vllm/model_executor/guided_decoding/guided_fields.py
@@ -3,12 +3,10 @@
 from dataclasses import dataclass
 from typing import Optional, TypedDict, Union
 
-from pydantic import BaseModel
-
 
 # These classes are deprecated, see SamplingParams
 class LLMGuidedOptions(TypedDict, total=False):
-    guided_json: Union[dict, BaseModel, str]
+    guided_json: Union[dict, str]
     guided_regex: str
     guided_choice: list[str]
     guided_grammar: str
@@ -20,7 +18,7 @@ class LLMGuidedOptions(TypedDict, total=False):
 @dataclass
 class GuidedDecodingRequest:
     """One of the fields will be used to retrieve the logit processor."""
-    guided_json: Optional[Union[dict, BaseModel, str]] = None
+    guided_json: Optional[Union[dict, str]] = None
     guided_regex: Optional[str] = None
     guided_choice: Optional[list[str]] = None
     guided_grammar: Optional[str] = None
diff --git a/vllm/utils.py b/vllm/utils.py
index 846df7743..c1213d463 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1878,14 +1878,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
     return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
 
 
-def is_in_doc_build() -> bool:
-    try:
-        from sphinx.ext.autodoc.mock import _MockModule
-        return isinstance(zmq, _MockModule)
-    except ModuleNotFoundError:
-        return False
-
-
 def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
     """
     Import a Python file according to its file path.
-- 
GitLab


From 435fa95444b2b0e408dc4b51ce7e759bbb3d0bcc Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Wed, 28 May 2025 22:08:57 +0800
Subject: [PATCH 021/274] [Frontend] add run batch to CLI (#18804)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 .../offline_inference/openai_batch/README.md  | 37 ++++++++++++-
 tests/entrypoints/openai/test_run_batch.py    | 26 ++++-----
 vllm/entrypoints/cli/main.py                  |  2 +
 vllm/entrypoints/cli/run_batch.py             | 55 +++++++++++++++++++
 vllm/entrypoints/openai/run_batch.py          | 12 ++--
 5 files changed, 110 insertions(+), 22 deletions(-)
 create mode 100644 vllm/entrypoints/cli/run_batch.py

diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md
index 42a19f71e..ce7529782 100644
--- a/examples/offline_inference/openai_batch/README.md
+++ b/examples/offline_inference/openai_batch/README.md
@@ -48,7 +48,19 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use command-line:
+
+```console
+vllm run-batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -68,7 +80,19 @@ The batch runner supports remote input and output urls that are accessible via h
 For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use command-line:
+
+```console
+vllm run-batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -164,6 +188,15 @@ python -m vllm.entrypoints.openai.run_batch \
     --model --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
+or use command-line:
+
+```console
+vllm run-batch \
+    -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    --model --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
 ### Step 4: View your results
 
 Your results are now on S3. You can view them in your terminal by running
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 643d0d06a..27802945a 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -2,7 +2,6 @@
 
 import json
 import subprocess
-import sys
 import tempfile
 
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
@@ -35,9 +34,8 @@ def test_empty_file():
         input_file.write("")
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -54,9 +52,8 @@ def test_completions():
         input_file.write(INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -79,9 +76,8 @@ def test_completions_invalid_input():
         input_file.write(INVALID_INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -95,9 +91,8 @@ def test_embeddings():
         input_file.write(INPUT_EMBEDDING_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -117,9 +112,8 @@ def test_score():
         input_file.write(INPUT_SCORE_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable,
-            "-m",
-            "vllm.entrypoints.openai.run_batch",
+            "vllm",
+            "run-batch",
             "-i",
             input_file.name,
             "-o",
diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index 6676c294c..5eba72fec 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -7,6 +7,7 @@ import sys
 import vllm.entrypoints.cli.benchmark.main
 import vllm.entrypoints.cli.collect_env
 import vllm.entrypoints.cli.openai
+import vllm.entrypoints.cli.run_batch
 import vllm.entrypoints.cli.serve
 import vllm.version
 from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup
@@ -17,6 +18,7 @@ CMD_MODULES = [
     vllm.entrypoints.cli.serve,
     vllm.entrypoints.cli.benchmark.main,
     vllm.entrypoints.cli.collect_env,
+    vllm.entrypoints.cli.run_batch,
 ]
 
 
diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py
new file mode 100644
index 000000000..f74c8da9b
--- /dev/null
+++ b/vllm/entrypoints/cli/run_batch.py
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import asyncio
+
+from prometheus_client import start_http_server
+
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.logger import logger
+from vllm.entrypoints.openai.run_batch import main as run_batch_main
+from vllm.entrypoints.openai.run_batch import make_arg_parser
+from vllm.utils import FlexibleArgumentParser
+from vllm.version import __version__ as VLLM_VERSION
+
+
+class RunBatchSubcommand(CLISubcommand):
+    """The `run-batch` subcommand for vLLM CLI."""
+
+    def __init__(self):
+        self.name = "run-batch"
+        super().__init__()
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        logger.info("vLLM batch processing API version %s", VLLM_VERSION)
+        logger.info("args: %s", args)
+
+        # Start the Prometheus metrics server.
+        # LLMEngine uses the Prometheus client
+        # to publish metrics at the /metrics endpoint.
+        if args.enable_metrics:
+            logger.info("Prometheus metrics enabled")
+            start_http_server(port=args.port, addr=args.url)
+        else:
+            logger.info("Prometheus metrics disabled")
+
+        asyncio.run(run_batch_main(args))
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        run_batch_parser = subparsers.add_parser(
+            "run-batch",
+            help="Run batch prompts and write results to file.",
+            description=(
+                "Run batch prompts using vLLM's OpenAI-compatible API.\n"
+                "Supports local or HTTP input/output files."),
+            usage=
+            "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
+        )
+        return make_arg_parser(run_batch_parser)
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [RunBatchSubcommand()]
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index eae83c9a4..f38465b22 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -33,9 +33,7 @@ from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
 
 
-def parse_args():
-    parser = FlexibleArgumentParser(
-        description="vLLM OpenAI-Compatible batch runner.")
+def make_arg_parser(parser: FlexibleArgumentParser):
     parser.add_argument(
         "-i",
         "--input-file",
@@ -98,7 +96,13 @@ def parse_args():
         default=False,
         help="If set to True, enable prompt_tokens_details in usage.")
 
-    return parser.parse_args()
+    return parser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description="vLLM OpenAI-Compatible batch runner.")
+    return make_arg_parser(parser).parse_args()
 
 
 # explicitly use pure text format, with a newline at the end
-- 
GitLab


From 6e4cea1cc56da6d4a558a50196c5698c36385890 Mon Sep 17 00:00:00 2001
From: daniel-salib <danielsalib@meta.com>
Date: Wed, 28 May 2025 07:15:12 -0700
Subject: [PATCH 022/274] decrement server_load on listen for disconnect
 (#18784)

Signed-off-by: Daniel Salib <danielsalib@meta.com>
---
 vllm/entrypoints/utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index cc651a172..1b0ea6909 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -26,6 +26,11 @@ async def listen_for_disconnect(request: Request) -> None:
     while True:
         message = await request.receive()
         if message["type"] == "http.disconnect":
+            if request.app.state.enable_server_load_tracking:
+                # on timeout/cancellation the BackgroundTask in load_aware_call
+                # cannot decrement the server load metrics.
+                # Must be decremented by with_cancellation instead.
+                request.app.state.server_load_metrics -= 1
             break
 
 
-- 
GitLab


From 321331b8ae41f13e519a63f99a0c427dc3907126 Mon Sep 17 00:00:00 2001
From: Alex Brooks <alex.brooks@ibm.com>
Date: Wed, 28 May 2025 09:58:24 -0600
Subject: [PATCH 023/274] [Core] Add Lora Support to Beam Search (#18346)

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
---
 .../entrypoints/openai/test_lora_adapters.py  | 34 ++++++++++
 tests/lora/test_qwen2vl.py                    | 62 ++++++++++++++++++-
 vllm/beam_search.py                           |  4 ++
 vllm/engine/protocol.py                       | 22 ++++---
 vllm/entrypoints/llm.py                       | 42 +++++++++++--
 vllm/entrypoints/openai/serving_chat.py       |  1 +
 vllm/entrypoints/openai/serving_completion.py |  1 +
 7 files changed, 150 insertions(+), 16 deletions(-)

diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 2fc08b475..cd07ca46c 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -313,3 +313,37 @@ async def test_loading_invalid_adapters_does_not_break_others(
         prompt=["Hello there", "Foo bar bazz buzz"],
         max_tokens=5,
     )
+
+
+@pytest.mark.asyncio
+async def test_beam_search_with_lora_adapters(
+    client: openai.AsyncOpenAI,
+    tmp_path,
+    zephyr_lora_files,
+):
+    """Validate that async beam search can be used with lora."""
+
+    async def load_and_run_adapter(adapter_name: str):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": adapter_name,
+                              "lora_path": str(zephyr_lora_files)
+                          })
+        for _ in range(3):
+            await client.completions.create(
+                model=adapter_name,
+                prompt=["Hello there", "Foo bar bazz buzz"],
+                max_tokens=5,
+                extra_body=dict(use_beam_search=True),
+            )
+
+    lora_tasks = []
+    for i in range(3):
+        lora_tasks.append(
+            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+
+    results, _ = await asyncio.wait(lora_tasks)
+
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 7bd3e3d0f..162714df2 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -10,6 +10,7 @@ import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
+from vllm.sampling_params import BeamSearchParams
 
 
 @pytest.fixture(autouse=not current_platform.is_cpu())
@@ -69,7 +70,7 @@ class Qwen2VLTester:
                  expected_outputs: list[str],
                  lora_id: Optional[int] = None,
                  temperature: float = 0,
-                 max_tokens: int = 5) -> list[str]:
+                 max_tokens: int = 5):
 
         sampling_params = vllm.SamplingParams(
             temperature=temperature,
@@ -97,7 +98,35 @@ class Qwen2VLTester:
                 generated), f"Generated text {generated} doesn't "
             f"match expected pattern {expected}"
 
-        return generated_texts
+    def run_beam_search_test(self,
+                             images: list[ImageAsset],
+                             expected_outputs: list[list[str]],
+                             lora_id: Optional[int] = None,
+                             temperature: float = 0,
+                             beam_width: int = 2,
+                             max_tokens: int = 5):
+
+        beam_search_params = BeamSearchParams(beam_width=beam_width,
+                                              max_tokens=max_tokens,
+                                              temperature=temperature)
+
+        inputs = [{
+            "prompt": self.PROMPT_TEMPLATE,
+            "multi_modal_data": {
+                "image": asset.pil_image
+            },
+        } for asset in images]
+
+        lora_request = LoRARequest(str(lora_id), lora_id,
+                                   self.config.lora_path)
+        outputs = self.llm.beam_search(inputs,
+                                       beam_search_params,
+                                       lora_request=lora_request)
+
+        for output_obj, expected_outs in zip(outputs, expected_outputs):
+            output_texts = [seq.text for seq in output_obj.sequences]
+            assert output_texts == expected_outs, \
+                f"Generated texts {output_texts} do not match expected {expected_outs}"  # noqa: E501
 
 
 TEST_IMAGES = [
@@ -110,6 +139,14 @@ EXPECTED_OUTPUTS = [
     "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
 ]
 
+# NOTE - beam search .text contains the whole text
+EXPECTED_BEAM_SEARCH_OUTPUTS = [
+    [
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands",  # noqa: E501
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall",  # noqa: E501
+    ],
+]
+
 QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
 QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
 
@@ -130,6 +167,27 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
                         lora_id=lora_id)
 
 
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="Qwen2-VL dependency xformers incompatible with ROCm")
+def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
+    """Test Qwen 2.0 VL model with LoRA through beam search."""
+    config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
+                        lora_path=qwen2vl_lora_files)
+    tester = Qwen2VLTester(config)
+
+    # Test with different LoRA IDs
+    for lora_id in [1, 2]:
+        # NOTE currently, we only test cherry blossom since stop sign
+        # output is slightly different for v1; - the root cause is likely
+        # independent of the intent of this test, which is to ensure beam
+        # search passes through lora through correctly.
+        tester.run_beam_search_test(
+            [ImageAsset("cherry_blossom")],
+            expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
+            lora_id=lora_id)
+
+
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index 967510aba..ddacc6695 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional, Union
 
+from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
 
 if TYPE_CHECKING:
@@ -19,6 +20,7 @@ class BeamSearchSequence:
     # The tokens includes the prompt.
     tokens: list[int]
     logprobs: list[dict[int, Logprob]]
+    lora_request: Optional[LoRARequest] = None
     cum_logprob: float = 0.0
     text: Optional[str] = None
     finish_reason: Optional[str] = None
@@ -41,6 +43,7 @@ class BeamSearchInstance:
     def __init__(
         self,
         prompt_tokens: list[int],
+        lora_request: Optional[LoRARequest] = None,
         logprobs: Optional[list[dict[int, Logprob]]] = None,
         **kwargs,
     ):
@@ -48,6 +51,7 @@ class BeamSearchInstance:
             BeamSearchSequence(
                 tokens=prompt_tokens,
                 logprobs=[] if logprobs is None else list(logprobs),
+                lora_request=lora_request,
                 **kwargs,
             )
         ]
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index a837a2d28..28341c2c6 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -65,6 +65,7 @@ class EngineClient(ABC):
         prompt: PromptType,
         request_id: str,
         params: BeamSearchParams,
+        lora_request: Optional[LoRARequest] = None,
     ) -> AsyncGenerator[RequestOutput, None]:
 
         beam_width = params.beam_width
@@ -106,27 +107,31 @@ class EngineClient(ABC):
                                cum_logprob=0,
                                logprobs=[],
                                multi_modal_data=multi_modal_data,
-                               mm_processor_kwargs=mm_processor_kwargs)
+                               mm_processor_kwargs=mm_processor_kwargs,
+                               lora_request=lora_request)
         ]
         completed = []
 
         for _ in range(max_tokens):
-            prompts_batch = [
+            prompts_batch, lora_req_batch = zip(*[(
                 TokensPrompt(prompt_token_ids=beam.tokens,
                              multi_modal_data=beam.multi_modal_data,
-                             mm_processor_kwargs=beam.mm_processor_kwargs)
-                for beam in all_beams
-            ]
+                             mm_processor_kwargs=beam.mm_processor_kwargs),
+                beam.lora_request,
+            ) for beam in all_beams])
 
             tasks = []
 
             request_id = f"beam_search-{random_uuid()}"
-            for i, individual_prompt in enumerate(prompts_batch):
+            for i, (individual_prompt,
+                    lora_req) in enumerate(zip(prompts_batch, lora_req_batch)):
                 request_id_item = f"{request_id}-{i}"
                 task = asyncio.create_task(
                     collect_from_async_generator(
-                        self.generate(individual_prompt, beam_search_params,
-                                      request_id_item)))
+                        self.generate(individual_prompt,
+                                      beam_search_params,
+                                      request_id_item,
+                                      lora_request=lora_req)))
                 tasks.append(task)
 
             output = await asyncio.gather(*tasks)
@@ -159,6 +164,7 @@ class EngineClient(ABC):
                                     tokens=current_beam.tokens + [token_id],
                                     logprobs=current_beam.logprobs +
                                     [logprobs],
+                                    lora_request=current_beam.lora_request,
                                     cum_logprob=current_beam.cum_logprob +
                                     logprob_obj.logprob,
                                     multi_modal_data=current_beam.
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 7e2e5161c..f8eeae61f 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -522,10 +522,28 @@ class LLM:
         executor = self.llm_engine.model_executor
         return executor.apply_model(func)
 
+    def _get_beam_search_lora_requests(
+        self,
+        lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
+        prompts: list[Union[TokensPrompt, TextPrompt]],
+    ) -> list[Optional[LoRARequest]]:
+        """Get the optional lora request corresponding to each prompt."""
+        if isinstance(lora_request,
+                      Sequence) and len(lora_request) != len(prompts):
+            raise ValueError(
+                "Lora request list should be the same length as the prompts")
+            return lora_request
+
+        if lora_request is None or isinstance(lora_request, LoRARequest):
+            return [lora_request] * len(prompts)
+
+        raise TypeError(f"Invalid lora_request type {type(lora_request)}")
+
     def beam_search(
         self,
         prompts: list[Union[TokensPrompt, TextPrompt]],
         params: BeamSearchParams,
+        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
     ) -> list[BeamSearchOutput]:
         """
         Generate sequences using beam search.
@@ -534,6 +552,7 @@ class LLM:
             prompts: A list of prompts. Each prompt can be a string or a list
                 of token IDs.
             params: The beam search parameters.
+            lora_request: LoRA request to use for generation, if any.
         """
         # TODO: how does beam search work together with length penalty,
         # frequency, penalty, and stopping criteria, etc.?
@@ -543,6 +562,9 @@ class LLM:
         ignore_eos = params.ignore_eos
         length_penalty = params.length_penalty
 
+        lora_requests = self._get_beam_search_lora_requests(
+            lora_request, prompts)
+
         def sort_beams_key(x: BeamSearchSequence) -> float:
             return get_beam_search_score(x.tokens, x.cum_logprob,
                                          tokenizer.eos_token_id,
@@ -570,7 +592,7 @@ class LLM:
                                             temperature=temperature)
         instances: list[BeamSearchInstance] = []
 
-        for prompt in prompts:
+        for lora_req, prompt in zip(lora_requests, prompts):
             # Add multimodal processor kwargs & data
             mm_kwargs = {}
             if "multi_modal_data" in prompt:
@@ -586,7 +608,12 @@ class LLM:
                 prompt_tokens = tokenizer.encode(prompt["prompt"])
 
             instances.append(
-                BeamSearchInstance(prompt_tokens, logprobs=None, **mm_kwargs))
+                BeamSearchInstance(
+                    prompt_tokens,
+                    lora_request=lora_req,
+                    logprobs=None,
+                    **mm_kwargs,
+                ), )
 
         for _ in range(max_tokens):
             all_beams: list[BeamSearchSequence] = list(
@@ -600,15 +627,17 @@ class LLM:
             if len(all_beams) == 0:
                 break
 
-            prompts_batch = [
-                create_tokens_prompt_from_beam(beam) for beam in all_beams
-            ]
+            # create the corresponding batch entries for prompt & optional lora
+            prompts_batch, lora_req_batch = zip(
+                *[(create_tokens_prompt_from_beam(beam), beam.lora_request)
+                  for beam in all_beams])
 
             # only runs for one step
             # we don't need to use tqdm here
             output = self.generate(prompts_batch,
                                    sampling_params=beam_search_params,
-                                   use_tqdm=False)
+                                   use_tqdm=False,
+                                   lora_request=lora_req_batch)
 
             for (start, end), instance in zip(instance_start_and_end,
                                               instances):
@@ -626,6 +655,7 @@ class LLM:
                             new_beam = BeamSearchSequence(
                                 tokens=current_beam.tokens + [token_id],
                                 logprobs=current_beam.logprobs + [logprobs],
+                                lora_request=current_beam.lora_request,
                                 cum_logprob=current_beam.cum_logprob +
                                 logprob_obj.logprob,
                                 multi_modal_data=current_beam.multi_modal_data,
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index bc11686d7..6a0e3b14d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -236,6 +236,7 @@ class OpenAIServingChat(OpenAIServing):
                         prompt=engine_prompt,
                         request_id=request_id,
                         params=sampling_params,
+                        lora_request=lora_request,
                     )
                 else:
                     generator = self.engine_client.generate(
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 7beaae287..1c06070cb 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -186,6 +186,7 @@ class OpenAIServingCompletion(OpenAIServing):
                         prompt=engine_prompt,
                         request_id=request_id,
                         params=sampling_params,
+                        lora_request=lora_request,
                     )
                 else:
                     generator = self.engine_client.generate(
-- 
GitLab


From fced756923e3b65dad0e1f4da202c4d2f82d2a4b Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Wed, 28 May 2025 11:59:11 -0400
Subject: [PATCH 024/274] [Chore] update ty configuration (#18839)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index eb55a9ffc..5286724b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -172,7 +172,8 @@ plugins.md033.enabled = false # inline-html
 plugins.md046.enabled = false # code-block-style
 plugins.md024.allow_different_nesting = true # no-duplicate-headers
 
-[tool.ty]
+[tool.ty.src]
+root = "./vllm"
 respect-ignore-files = true
 
 [tool.ty.environment]
-- 
GitLab


From c68b5c63eba0ecccda8ab34ca1829cb4ced2c430 Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Thu, 29 May 2025 01:36:21 +0800
Subject: [PATCH 025/274] [Misc] fix olmoe model layer can't laod in tp gt 1
 (#18828)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
---
 vllm/model_executor/models/olmoe.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 6364b89fb..af2894555 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Inference-only OLMoE model compatible with HuggingFace weights."""
 from collections.abc import Iterable
+from functools import partial
 from typing import Any, Optional, Union
 
 import torch
@@ -22,7 +23,10 @@ from transformers import PretrainedConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
+from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -140,8 +144,11 @@ class OlmoeAttention(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        self.q_norm = RMSNorm(hidden_size, eps=1e-5)
-        self.k_norm = RMSNorm(hidden_size, eps=1e-5)
+        self.tp_size = tp_size
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.q_norm = RMSNorm(self.total_num_heads * self.head_dim, eps=1e-5)
+        self.k_norm = RMSNorm(self.total_num_kv_heads * self.head_dim,
+                              eps=1e-5)
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
@@ -165,6 +172,20 @@ class OlmoeAttention(nn.Module):
                               quant_config=quant_config,
                               prefix=f"{prefix}.attn")
 
+    def _apply_qk_norm(self, q: torch.Tensor,
+                       k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        if self.tp_size > 1:
+            splitter = partial(split_tensor_along_last_dim,
+                               num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        return q, k
+
     def forward(
         self,
         positions: torch.Tensor,
@@ -172,7 +193,7 @@ class OlmoeAttention(nn.Module):
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.q_norm(q.contiguous()), self.k_norm(k.contiguous())
+        q, k = self._apply_qk_norm(q, k)
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
-- 
GitLab


From 0e98964e94385c914783842eac4b448236018689 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Wed, 28 May 2025 19:54:12 +0100
Subject: [PATCH 026/274] [V1][Metrics] Remove metrics that were deprecated in
 0.8 (#18837)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 docs/usage/metrics.md                         | 13 ---
 .../prometheus_grafana/grafana.json           | 30 -------
 tests/entrypoints/openai/test_metrics.py      |  7 +-
 vllm/engine/llm_engine.py                     | 15 ----
 vllm/engine/metrics.py                        | 89 -------------------
 vllm/engine/metrics_types.py                  |  3 -
 6 files changed, 1 insertion(+), 156 deletions(-)

diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md
index 9ad725318..6603aa83b 100644
--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -35,19 +35,6 @@ The following metrics are exposed:
 --8<-- "vllm/engine/metrics.py:metrics-definitions"
 ```
 
-The following metrics are deprecated and due to be removed in a future version:
-
-- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
-  `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
-  used in V1.
-- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
-  counters in V1.
-- `vllm:time_in_queue_requests` because it duplicates
-  `vllm:request_queue_time_seconds`.
-- `vllm:model_forward_time_milliseconds` and
-  `vllm:model_execute_time_milliseconds` because
-  prefill/decode/inference time metrics should be used instead.
-
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
 and are then removed in version `X.Y+2`.
diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
index fbe96b48e..3488956a5 100644
--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -577,23 +577,6 @@
           "refId": "A",
           "useBackend": false
         },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "disableTextWrap": false,
-          "editorMode": "builder",
-          "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
-          "fullMetaSearch": false,
-          "hide": false,
-          "includeNullMetadata": true,
-          "instant": false,
-          "legendFormat": "Num Swapped",
-          "range": true,
-          "refId": "B",
-          "useBackend": false
-        },
         {
           "datasource": {
             "type": "prometheus",
@@ -874,19 +857,6 @@
           "legendFormat": "GPU Cache Usage",
           "range": true,
           "refId": "A"
-        },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "editorMode": "code",
-          "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
-          "hide": false,
-          "instant": false,
-          "legendFormat": "CPU Cache Usage",
-          "range": true,
-          "refId": "B"
         }
       ],
       "title": "Cache Utilization",
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 42f7b098f..b21c0173c 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
 EXPECTED_METRICS = [
     "vllm:num_requests_running",
-    "vllm:num_requests_swapped",  # deprecated
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",  # deprecated
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",
@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]
 
-HIDDEN_DEPRECATED_METRICS = [
-    "vllm:num_requests_swapped",
-    "vllm:cpu_cache_usage_perc",
-]
+HIDDEN_DEPRECATED_METRICS: list[str] = []
 
 
 @pytest.mark.asyncio
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ff33d566a..a9600a2c8 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1680,9 +1680,6 @@ class LLMEngine:
         time_inference_requests: List[float] = []
         time_prefill_requests: List[float] = []
         time_decode_requests: List[float] = []
-        time_in_queue_requests: List[float] = []
-        model_forward_time_requests: List[float] = []
-        model_execute_time_requests: List[float] = []
         #   Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1790,15 +1787,6 @@ class LLMEngine:
                             now - seq_group.metrics.first_token_time)
                         time_inference_requests.append(
                             now - seq_group.metrics.first_scheduled_time)
-                    if seq_group.metrics.time_in_queue is not None:
-                        time_in_queue_requests.append(
-                            seq_group.metrics.time_in_queue)
-                    if seq_group.metrics.model_forward_time is not None:
-                        model_forward_time_requests.append(
-                            seq_group.metrics.model_forward_time)
-                    if seq_group.metrics.model_execute_time is not None:
-                        model_execute_time_requests.append(
-                            seq_group.metrics.model_execute_time * 1000)
                     # Metadata
                     num_prompt_tokens_requests.append(
                         len(seq_group.prompt_token_ids))
@@ -1867,9 +1855,6 @@ class LLMEngine:
             time_inference_requests=time_inference_requests,
             time_prefill_requests=time_prefill_requests,
             time_decode_requests=time_decode_requests,
-            time_in_queue_requests=time_in_queue_requests,
-            model_forward_time_requests=model_forward_time_requests,
-            model_execute_time_requests=model_execute_time_requests,
             #   Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 34b48f83b..916afe0c8 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -80,17 +80,6 @@ class Metrics:
             multiprocess_mode="livemostrecent",
         )
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_scheduler_swapped = self._gauge_cls(
-                name="vllm:num_requests_swapped",
-                documentation=(
-                    "Number of requests swapped to CPU. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         #   KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
@@ -98,35 +87,6 @@ class Metrics:
             labelnames=labelnames,
             multiprocess_mode="sum")
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_cpu_cache_usage = self._gauge_cls(
-                name="vllm:cpu_cache_usage_perc",
-                documentation=(
-                    "CPU KV-cache usage. 1 means 100 percent usage. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:cpu_prefix_cache_hit_rate",
-                documentation=(
-                    "CPU prefix cache block hit rate. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:gpu_prefix_cache_hit_rate",
-                documentation=("GPU prefix cache block hit rate. "
-                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
-                               "and vllm:gpu_prefix_cache_queries in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
             name="vllm:num_preemptions_total",
@@ -200,36 +160,6 @@ class Metrics:
             "Histogram of time spent in DECODE phase for request.",
             labelnames=labelnames,
             buckets=request_latency_buckets)
-        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_time_in_queue_request = self._histogram_cls(
-                name="vllm:time_in_queue_requests",
-                documentation=
-                ("Histogram of time the request spent in the queue in seconds. "
-                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-                labelnames=labelnames,
-                buckets=request_latency_buckets)
-
-        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_model_forward_time_request = self._histogram_cls(
-                name="vllm:model_forward_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model forward pass in ms. "
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
-            self.histogram_model_execute_time_request = self._histogram_cls(
-                name="vllm:model_execute_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model execute function in ms."
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
 
         #   Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase):
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                            stats.cpu_cache_usage_sys)
-            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                            stats.cpu_prefix_cache_hit_rate)
-            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase):
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        if self.metrics.show_hidden_metrics:
-            self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                                stats.time_in_queue_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_forward_time_request,
-                stats.model_forward_time_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_execute_time_request,
-                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 9e6d5ef29..acc83011d 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -53,9 +53,6 @@ class Stats:
     time_inference_requests: List[float]
     time_prefill_requests: List[float]
     time_decode_requests: List[float]
-    time_in_queue_requests: List[float]
-    model_forward_time_requests: List[float]
-    model_execute_time_requests: List[float]
     #   Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
-- 
GitLab


From a09c7ca9f2c01325eb46e82272c55358e5f98e8e Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Wed, 28 May 2025 14:57:19 -0400
Subject: [PATCH 027/274] [Chore][Spec Decode] Update check NoneType instead of
 assigning variables (#18836)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
---
 vllm/v1/worker/gpu_model_runner.py | 50 ++++++++++++++----------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 910c0e80b..5d5558162 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -146,31 +146,27 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # req_id -> (input_id -> encoder_output)
         self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
 
-        # Set up speculative decoding.
-        self.use_spec_decode = False
         self.use_aux_hidden_state_outputs = False
-        if self.speculative_config:
-            self.use_spec_decode = True
-
-            # NOTE(Jiayi): currently we put the entire draft model on
-            # the last PP rank. This is not ideal if there are many
-            # layers in the draft model.
-            if get_pp_group().is_last_rank:
-                if self.speculative_config.method == "ngram":
-                    self.drafter = NgramProposer(self.vllm_config)
-                elif self.speculative_config.use_eagle():
-                    self.drafter = EagleProposer(self.vllm_config, self.device,
-                                                 self)  # type: ignore
-                    if self.speculative_config.method == "eagle3":
-                        self.use_aux_hidden_state_outputs = True
-                elif self.speculative_config.method == "medusa":
-                    self.drafter = MedusaProposer(
-                        vllm_config=self.vllm_config,
-                        device=self.device)  # type: ignore
-                else:
-                    raise ValueError("Unknown speculative decoding method: "
-                                     f"{self.speculative_config.method}")
-                self.rejection_sampler = RejectionSampler()
+        # Set up speculative decoding.
+        # NOTE(Jiayi): currently we put the entire draft model on
+        # the last PP rank. This is not ideal if there are many
+        # layers in the draft model.
+        if self.speculative_config and get_pp_group().is_last_rank:
+            if self.speculative_config.method == "ngram":
+                self.drafter = NgramProposer(self.vllm_config)
+            elif self.speculative_config.use_eagle():
+                self.drafter = EagleProposer(self.vllm_config, self.device,
+                                             self)  # type: ignore
+                if self.speculative_config.method == "eagle3":
+                    self.use_aux_hidden_state_outputs = True
+            elif self.speculative_config.method == "medusa":
+                self.drafter = MedusaProposer(
+                    vllm_config=self.vllm_config,
+                    device=self.device)  # type: ignore
+            else:
+                raise ValueError("Unknown speculative decoding method: "
+                                 f"{self.speculative_config.method}")
+            self.rejection_sampler = RejectionSampler()
 
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
@@ -1318,7 +1314,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         for i in discard_sampled_tokens_req_indices:
             valid_sampled_token_ids[i].clear()
 
-        if not self.use_spec_decode:
+        if not self.speculative_config:
             # Speculative decoding is not enabled.
             spec_token_ids = None
         elif self.speculative_config.method == "ngram":
@@ -1740,7 +1736,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             else:
                 hidden_states = outputs
 
-            if self.use_spec_decode and self.speculative_config.use_eagle():
+            if self.speculative_config and self.speculative_config.use_eagle():
                 assert isinstance(self.drafter, EagleProposer)
                 self.drafter.dummy_run(num_tokens)
 
@@ -1795,7 +1791,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     "initializing the engine.") from e
             else:
                 raise e
-        if self.use_spec_decode:
+        if self.speculative_config:
             draft_token_ids = [[0] for _ in range(num_reqs)]
             dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy(
                 draft_token_ids, self.device)
-- 
GitLab


From 643622ba465f0bbcb07e1d81c60dac35ed994ebf Mon Sep 17 00:00:00 2001
From: Akshat Tripathi <akshat@krai.ai>
Date: Wed, 28 May 2025 20:59:09 +0100
Subject: [PATCH 028/274] [Hardware][TPU][V1] Multi-LoRA Optimisations for the
 V1 TPU backend (#15655)

Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Signed-off-by: xihajun <junfan@krai.ai>
Signed-off-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Signed-off-by: Jorge de Freitas <jorge@krai.ai>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: xihajun <junfan@krai.ai>
Co-authored-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Co-authored-by: Jorge de Freitas <jorge@krai.ai>
---
 .../scripts/hardware_ci/run-tpu-v1-test.sh    |   6 +-
 tests/tpu/lora/test_pallas_kernels.py         |  73 --------
 vllm/lora/models.py                           |   4 +-
 vllm/lora/ops/xla_ops/lora_ops.py             | 140 ++++++++++------
 vllm/lora/ops/xla_ops/pallas.py               | 133 ---------------
 vllm/lora/punica_wrapper/punica_tpu.py        | 157 +++++++++++++-----
 vllm/v1/worker/lora_model_runner_mixin.py     |  57 +++++--
 vllm/v1/worker/tpu_model_runner.py            |  82 +++++++--
 vllm/v1/worker/tpu_worker.py                  |   7 +-
 9 files changed, 325 insertions(+), 334 deletions(-)
 delete mode 100644 tests/tpu/lora/test_pallas_kernels.py
 delete mode 100644 vllm/lora/ops/xla_ops/pallas.py

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index eb82da3a8..5dd53420d 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -122,10 +122,8 @@ run_and_track_test 11 "test_struct_output_generate.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
 run_and_track_test 12 "test_moe_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-
-# Disable the TPU LoRA tests until the feature is activated
-# run_and_track_test 13 "test_lora (directory)" \
-#     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/"
+run_and_track_test 13 "test_lora.py" \
+    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
diff --git a/tests/tpu/lora/test_pallas_kernels.py b/tests/tpu/lora/test_pallas_kernels.py
deleted file mode 100644
index 8bd47de50..000000000
--- a/tests/tpu/lora/test_pallas_kernels.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import torch
-
-# Required to register the custom ops
-import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
-
-N_TOKENS = [16, 1024, 4096]
-HIDDEN_SIZES = [1024, 2048, 4096]
-
-DTYPES = [torch.bfloat16]
-NUM_LORA = [1, 4, 16]
-RANKS = [32, 256, 512]
-
-
-def generate_test_data(T, D, L, N, seed, dtype=torch.float32):
-    """
-    Inputs: (All integers)
-        T: Total number of tokens
-        D: Input dim
-        L: LoRA Dim
-        N: N LoRAs
-    
-    Outputs:
-        inputs:     torch.Tensor - shape (T, D)
-        loras:      torch.Tensor - shape (N, 1, L, D)
-        idxs:       torch.Tensor - shape (T, ) - all values must be in [0, N)
-        
-        ref_output: torch.Tensor - shape (T, L) - inputs @ loras[idxs].T
-    """
-    torch.manual_seed(seed)
-
-    inputs = torch.randn((T, D), device="xla", dtype=dtype)
-    loras = torch.randn((N, 1, L, D), device="xla", dtype=dtype)
-    idxs = torch.randint(0, N, (T, ), dtype=torch.int32, device="xla")
-
-    ref_output = ref_bgmv(inputs, loras, idxs)
-    return inputs, loras, idxs, ref_output
-
-
-def ref_bgmv(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.Tensor):
-    selected_loras = loras[idxs]
-    if len(selected_loras.shape) == 4:
-        selected_loras = selected_loras.squeeze(axis=1)
-
-    batch_size, output_size, input_size = selected_loras.shape
-    return (selected_loras @ inputs.reshape(
-        (batch_size, input_size, 1))).reshape((batch_size, output_size))
-
-
-# Parameterize tests with various shapes and dtypes
-@pytest.mark.parametrize("T", N_TOKENS)
-@pytest.mark.parametrize("D", HIDDEN_SIZES)
-@pytest.mark.parametrize("L", RANKS)
-@pytest.mark.parametrize("N", NUM_LORA)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", [0])
-def test_bgmv_correctness(T, D, L, N, dtype, op_type, seed):
-    if op_type == "expand":
-        D, L = L, D
-
-    inputs, loras, idxs, ref_output = generate_test_data(
-        T, D, L, N, seed, dtype)
-
-    # Run bgmv
-    output = torch.ops.xla.bgmv(inputs, loras, idxs)
-
-    # Make sure we have no NaNs
-    assert not torch.any(torch.isnan(output))
-
-    # Compare with reference output
-    assert torch.allclose(output, ref_output, rtol=1e-2, atol=1e-2)
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index af5cebdf2..d3b1374a9 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -200,7 +200,7 @@ class LoRAModel(AdapterModel):
             weights_mapper: Optional[WeightsMapper] = None,
             tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
-        
+
         Args:
             lora_dir: The local path that has lora data.
             expected_lora_modules: Name of modules that are expected to be
@@ -620,7 +620,7 @@ class LoRAModelManager(AdapterModelManager):
     def _filter_unsupported_mm_module(self, module_name: str) -> bool:
         """
         Regarding multimodal models, vLLM currently only supports adding LoRA to
-        language model. LoRA for other modules, such as the vision tower, will 
+        language model. LoRA for other modules, such as the vision tower, will
         be filtered out.
         """
         if self.supports_mm:
diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py
index acbec0cfa..dff4d5181 100644
--- a/vllm/lora/ops/xla_ops/lora_ops.py
+++ b/vllm/lora/ops/xla_ops/lora_ops.py
@@ -1,63 +1,99 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import jax
+import jax.numpy as jnp
 import torch
+import torch.nn.functional as F
+import torch_xla.core.xla_builder as xb
+from torch.library import impl
+from torch_xla.experimental.custom_kernel import XLA_LIB, jax_import_guard
 
-# Required to register the custom ops
-import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
 
+@jax.jit
+def bgmv_jax(inputs, loras, idxs):
+    return jnp.einsum(
+        "td,tX,Xld->tl",
+        inputs,
+        jax.nn.one_hot(idxs, loras.shape[0], dtype=inputs.dtype),
+        loras,
+    )
 
-def bgmv_expand(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                add_inputs: bool = True):
+
+XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor")
+
+
+@impl(XLA_LIB, "bgmv", "XLA")
+def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
+    if len(loras.shape) == 4:
+        loras = loras.squeeze(axis=1)
+
+    jax_import_guard()
+    return xb.call_jax(bgmv_jax, (inputs, loras, idxs))
+
+
+@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd")
+def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor,
+                 idxs: torch.IntTensor):
+    T, _ = inputs.shape
+    if len(loras.shape) == 4:
+        loras = loras.squeeze(axis=1)
+    _, L, _ = loras.shape
+
+    return torch.empty((T, L), device=inputs.device)
+
+
+def bgmv_expand(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    add_inputs: bool = True,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
-        
-        output_tensor (torch.Tensor): output tensor of shape 
+
+        output_tensor (torch.Tensor): output tensor of shape
             [num_tokens, hidden_size * num_slices].
-        
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output 
+        add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
     """
 
     outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-    n_tokens = outputs.size(0)
 
     limit = output_tensor.shape[0]
     if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
         limit = 1
 
-    outputs = torch.cat(
-        (outputs,
-         torch.zeros((n_tokens, output_tensor.shape[1] - outputs.shape[1]),
-                     device=outputs.device)),
-        dim=1)
+    if output_tensor.shape[1] > outputs.shape[1]:
+        outputs = F.pad(outputs,
+                        (0, output_tensor.shape[1] - outputs.shape[1], 0, 0))
 
     if add_inputs:
-        return output_tensor + outputs[:limit, :]
+        return output_tensor + outputs[:limit, :output_tensor.shape[1]]
     else:
-        return outputs[:limit, :]
+        return outputs[:limit, :output_tensor.shape[1]]
 
 
-def bgmv_shrink(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                scaling: float = 1.0):
+def bgmv_shrink(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    scaling: float = 1.0,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
         output_tensor (torch.Tensor): (Unused) output tensor (placeholder).
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
         scaling (float, optional): Scalar multiplier applied to the output.
     """
@@ -66,39 +102,41 @@ def bgmv_shrink(inputs: torch.Tensor,
                                         lora_indices_tensor)
 
 
-def bgmv_expand_slice(inputs: torch.Tensor,
-                      lora_b_weights: torch.Tensor,
-                      output_tensor: torch.Tensor,
-                      lora_indices_tensor: torch.Tensor,
-                      slice_offset: int,
-                      slice_size: int,
-                      add_inputs: bool = True):
+def bgmv_expand_slice(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    slice_offset: int,
+    slice_size: int,
+    add_inputs: bool = True,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
-        
-        output_tensor (torch.Tensor): output tensor of shape 
+
+        output_tensor (torch.Tensor): output tensor of shape
             [num_tokens, hidden_size * num_slices].
-        
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output 
+        add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
     """
     outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-    n_tokens = outputs.size(0)
 
-    outputs = torch.cat((
-        torch.zeros((n_tokens, slice_offset), device=outputs.device),
+    outputs = F.pad(
         outputs,
-        torch.zeros(
-            (n_tokens, output_tensor.shape[1] - (slice_offset + slice_size)),
-            device=outputs.device),
-    ),
-                        dim=1)
+        (
+            slice_offset,
+            output_tensor.shape[1] - (slice_offset + slice_size),
+            0,
+            0,
+        ),
+    )
 
     if add_inputs:
         return output_tensor + outputs
diff --git a/vllm/lora/ops/xla_ops/pallas.py b/vllm/lora/ops/xla_ops/pallas.py
deleted file mode 100644
index 35dc30753..000000000
--- a/vllm/lora/ops/xla_ops/pallas.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-import functools
-
-import jax
-import jax.numpy as jnp
-import torch
-from jax.experimental import pallas as pl
-from jax.experimental.pallas import tpu as pltpu
-from torch.library import impl
-from torch_xla.experimental.custom_kernel import (XLA_LIB, jax_import_guard,
-                                                  make_kernel_from_pallas)
-
-# TODO: Tune these
-TOKENS_BLOCK = 16
-LORA_RANK_BLOCK = 128
-DIM_BLOCK_SIZE = 128
-
-
-def _bgmv_kernel(bT: int, bL: int, idx_ref, inp_ref, lora_ref, out_ref,
-                 acc_ref, mask_ref):
-
-    @pl.when(pl.program_id(2) == 0)
-    def _():
-        acc_ref[...] = jnp.zeros_like(acc_ref[...], dtype=jnp.float32)
-
-    t = pl.program_id(0)
-
-    for i in range(bT):
-        idx = idx_ref[i + bT * t]
-        mask_ref[...] = jnp.zeros_like(mask_ref[...], dtype=jnp.float32)
-        mask_ref[i, :] = jnp.ones((bL, ), dtype=jnp.float32)
-
-        acc_ref[...] += jax.lax.dot_general(
-            inp_ref[...],
-            lora_ref[idx, ...], (((1, ), (1, )), ((), ())),
-            preferred_element_type=jnp.float32) * mask_ref[...]
-
-    @pl.when(pl.program_id(2) == pl.num_programs(2) - 1)
-    def _():
-        out_ref[...] = acc_ref[...].astype(out_ref.dtype)
-
-
-@jax.jit
-def _bgmv(
-    idxs: jax.Array,  # (T, ) int32
-    inputs: jax.Array,  # (T, D) model dtype
-    loras: jax.Array  # (N, L, D) model dtype
-) -> jax.Array:  # (T, L) model dtype
-    T, D = inputs.shape
-    N, L, _ = loras.shape
-
-    return pl.pallas_call(
-        kernel=functools.partial(_bgmv_kernel, TOKENS_BLOCK, LORA_RANK_BLOCK),
-        out_shape=jax.ShapeDtypeStruct((T, L), dtype=inputs.dtype),
-        grid_spec=pltpu.PrefetchScalarGridSpec(
-            num_scalar_prefetch=1,
-            grid=(T // TOKENS_BLOCK, L // LORA_RANK_BLOCK,
-                  D // DIM_BLOCK_SIZE),
-            in_specs=[
-                pl.BlockSpec((TOKENS_BLOCK, DIM_BLOCK_SIZE),
-                             lambda i, j, k, block_idx: (i, k)),
-                pl.BlockSpec((N, LORA_RANK_BLOCK, DIM_BLOCK_SIZE),
-                             lambda i, j, k, block_idx: (0, j, k)),
-            ],
-            out_specs=pl.BlockSpec((TOKENS_BLOCK, LORA_RANK_BLOCK),
-                                   lambda i, j, k, block_idx: (i, j)),
-            scratch_shapes=[
-                pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32),
-                pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32)
-            ]),
-        compiler_params=pltpu.TPUCompilerParams(
-            dimension_semantics=("parallel", "parallel", "arbitrary")),
-        name="bgmv")(idxs, inputs, loras)
-
-
-def bgmv_shape_function(idxs, inputs, loras):
-    T, _ = inputs.shape
-    _, L, _ = loras.shape
-
-    return [((T, L), inputs.dtype)]
-
-
-XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor", )
-
-
-@impl(XLA_LIB, "bgmv", "XLA")
-def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
-    inputs = inputs.to(dtype=loras.dtype)
-
-    if len(loras.shape) == 4:
-        loras = loras.squeeze(axis=1)
-
-    jax_import_guard()
-    kernel = make_kernel_from_pallas(_bgmv, bgmv_shape_function)
-
-    T, _ = inputs.shape
-    _, L, D = loras.shape
-
-    # Pad the loras' rank if it's too low. This is to allow it to fit in a TPU
-    # register. This has to happen in pytorch, doing it in Jax will lead to NaNs
-    L1 = L
-    if LORA_RANK_BLOCK > L or L % LORA_RANK_BLOCK != 0:
-        L1 = (L // LORA_RANK_BLOCK + 1) * LORA_RANK_BLOCK
-
-    D1 = D
-    if DIM_BLOCK_SIZE > D or D % DIM_BLOCK_SIZE != 0:
-        D1 = (D // DIM_BLOCK_SIZE + 1) * DIM_BLOCK_SIZE
-
-    T1 = T
-    if TOKENS_BLOCK > T or T % TOKENS_BLOCK != 0:
-        T1 = (T // TOKENS_BLOCK + 1) * TOKENS_BLOCK
-
-    if D1 != D or L1 != L:
-        loras = torch.nn.functional.pad(loras, (0, D1 - D, 0, L1 - L, 0, 0))
-    if D1 != D or T1 != T:
-        inputs = torch.nn.functional.pad(inputs, (0, D1 - D, 0, T1 - T))
-        if T1 != T:
-            idxs = torch.nn.functional.pad(idxs, ((0, T1 - T)))
-
-    return kernel(idxs, inputs, loras)[:T, :L]
-
-
-@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd")
-def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor,
-                 idxs: torch.IntTensor):
-    T, _ = inputs.shape
-
-    if len(loras.shape) == 4:
-        loras = loras.squeeze(axis=1)
-
-    _, L, _ = loras.shape
-
-    return torch.empty((T, L), device=inputs.device)
diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py
index f3153c6da..0556e583f 100644
--- a/vllm/lora/punica_wrapper/punica_tpu.py
+++ b/vllm/lora/punica_wrapper/punica_tpu.py
@@ -1,11 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Optional, Union
+import math
+from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 import torch.nn.functional as F
+import torch_xla.core.xla_model as xm
 
 from vllm.lora.ops.xla_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
+from vllm.lora.punica_wrapper.utils import convert_mapping
+
+if TYPE_CHECKING:
+    # avoid circuit import
+    from vllm.lora.layers import LoRAMapping
+    from vllm.lora.models import LongContextLoRAContext
 
 from .punica_base import PunicaWrapperBase
 
@@ -31,6 +39,15 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         self._sampler_indices_padded = self._sampler_indices_padded.to(
             dtype=torch.int32)
 
+        torch.ops.xla.dynamo_set_buffer_donor_(self._token_lora_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded,
+                                               True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._long_lora_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch,
+                                               True)
+
         torch._dynamo.mark_dynamic(self._token_lora_indices, 0)
         torch._dynamo.mark_dynamic(self._embeddings_indices, 1)
         torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0)
@@ -55,15 +72,11 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
     def shrink(
         self,
-        y: torch.Tensor,
         x: torch.Tensor,
         w_t_all: torch.Tensor,
         scale: float,
     ):
-        if self.no_lora:
-            return y
-        return bgmv_shrink(x, w_t_all, y, self._get_token_lora_indices(x),
-                           scale)
+        return bgmv_shrink(x, w_t_all, self._get_token_lora_indices(x), scale)
 
     def expand(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor,
                add_inputs: bool):
@@ -72,7 +85,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
     def expand_slice(self, y: torch.Tensor, x: torch.Tensor,
                      w_t_all: torch.Tensor, y_offset: int, y_slice_size: int,
-                     y_total_size: int, add_inputs: bool) -> torch.Tensor:
+                     add_inputs: bool) -> torch.Tensor:
         return bgmv_expand_slice(x, w_t_all, y,
                                  self._get_token_lora_indices(x), y_offset,
                                  y_slice_size, add_inputs)
@@ -98,9 +111,8 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         x = x.view(-1, x.shape[-1])
 
         for slice_idx in range(len(lora_a_stacked)):
-            y_s = y[slice_idx]
             lora_s = lora_a_stacked[slice_idx]
-            y_s = self.shrink(y_s, x, lora_s, scale)
+            y_s = self.shrink(x, lora_s, scale)
             y[slice_idx, :, :] = y_s  # type: ignore[index]
         return y
 
@@ -140,15 +152,12 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             y = self._apply_bias(self._get_token_lora_indices(y), y,
                                  output_slices, lora_bias_stacked)
         for slice_idx in range(len(lora_b_stacked)):
-            y = self.expand_slice(
-                y,
-                x[slice_idx],
-                lora_b_stacked[slice_idx],
-                offset_left,
-                output_slices[slice_idx],
-                y_total_size=sum(output_slices),
-                add_inputs=add_inputs,
-            )
+            y = self.expand_slice(y,
+                                  x[slice_idx],
+                                  lora_b_stacked[slice_idx],
+                                  offset_left,
+                                  output_slices[slice_idx],
+                                  add_inputs=add_inputs)
             offset_left += output_slices[slice_idx]
         return y.view_as(y_org)
 
@@ -216,12 +225,10 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
-            # We set the buffer to be float32 by default, consistent with the
-            # triton op
             T = x.size(0)
             buffer = torch.zeros(
                 (len(output_slices), T, r),
-                dtype=torch.float32,
+                dtype=x.dtype,
                 device=x.device,
             )
         buffer = self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
@@ -257,26 +264,16 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             scale (float): Scaling factor.
             buffer (Optional[torch.Tensor]):Default to None.
         """
-        if self.no_lora:
-            return y
-
         y_org = y
         y = y.view(-1, y.shape[-1])
         x = x.view(-1, x.shape[-1])
-        r = lora_b_stacked.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default, consistent with the
-            # triton op
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-
-        buffer = bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices,
-                             scale)
+
+        sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
+        buffer = bgmv_shrink(x, lora_a_stacked, sampler_indices, scale)
         y = bgmv_expand(buffer,
                         lora_b_stacked,
                         y,
-                        self.sampler_indices,
+                        sampler_indices,
                         add_inputs=True)
         return y.view_as(y_org)
 
@@ -316,10 +313,92 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
         return output.view_as(org_output)
 
+    # This performs the same tensor ops as the base method, except it does them
+    # on the CPU then transfers the results to the TPU
+    def _update_base_metadata(
+        self,
+        mapping: "LoRAMapping",
+        lora_index_to_id: list[Optional[int]],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        long_lora_context: Optional["LongContextLoRAContext"] = None,
+    ):
+        # Make sure we don't accidentally collect outside operations
+        xm.mark_step()
+
+        # Pad the prompt mapping to avoid running into recompiles on the TPU
+        # TODO: Should this happen inside mapping internally? If so how can we
+        # avoid having backend specific LoRAMapping classes?
+        mapping.prompt_mapping = self._pad_prompt_mapping(
+            mapping.prompt_mapping)
+
+        (
+            base_indices,
+            sampler_indices,
+            sampler_indices_padded,
+            embeddings_indices,
+            long_lora_offsets_tensor,
+            indices_len,
+        ) = convert_mapping(
+            mapping,
+            lora_index_to_id,
+            max_loras,
+            vocab_size,
+            extra_vocab_size,
+            "cpu",
+            long_lora_context,
+        )
+        self._token_lora_indices = self._pad_to_shape(
+            base_indices, self._token_lora_indices.shape,
+            dims=1).to(self.device)
+        self._sampler_indices = self._pad_to_shape(sampler_indices,
+                                                   self._sampler_indices.shape,
+                                                   dims=1).to(self.device)
+        self._sampler_indices_padded = self._pad_to_shape(
+            sampler_indices_padded, self._sampler_indices_padded.shape,
+            dims=1).to(self.device)
+        self._embeddings_indices = self._pad_to_shape(
+            embeddings_indices, self._embeddings_indices.shape,
+            dims=2).to(self.device)
+        if long_lora_offsets_tensor is not None:
+            self._long_lora_indices = self._pad_to_shape(
+                long_lora_offsets_tensor,
+                self._long_lora_indices.shape,
+                dims=1).to(self.device)
+        else:
+            zeroed = torch.zeros_like(self._long_lora_indices.cpu(),
+                                      dtype=torch.int32)
+            self._long_lora_indices = zeroed.to(self.device)
+        self.indices_len[:] = indices_len
+
     def _update_prefill_metadata(self,
                                  token_lora_tensor: torch.Tensor) -> None:
         self.batch_size = 1
-        self._lora_indices_per_batch[:self.batch_size].copy_(
-            token_lora_tensor[:self.batch_size])
-        # TODO: .item() is extremely inefficient on TPU, so find a way around it
-        self.no_lora = torch.all(token_lora_tensor == -1).item()
+        self._lora_indices_per_batch[:self.
+                                     batch_size] = token_lora_tensor[:self.
+                                                                     batch_size]
+
+    def _pad_prompt_mapping(
+            self, prompt_mapping: tuple[int, ...]) -> tuple[int, ...]:
+        num_reqs = len(prompt_mapping)
+
+        # From vllm/v1/worker/tpu_model_runner:51, but need to avoid a circular
+        # import
+        MIN_NUM_SEQS = 8
+
+        padded_num_reqs = max(2**math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS)
+        pad_len = padded_num_reqs - num_reqs
+
+        padding = [-1] * pad_len
+        return tuple(list(prompt_mapping) + padding)
+
+    def _pad_to_shape(self, src, target_shape, dims=1):
+        if dims == 1:
+            pad_len = target_shape[0] - src.shape[0]
+            return F.pad(src, (0, pad_len), value=0).to(torch.int32)
+        else:
+            pad_rows = target_shape[0] - src.shape[0]
+            pad_cols = target_shape[1] - src.shape[1]
+            return F.pad(src, (0, pad_cols, 0, pad_rows),
+                         value=0).to(torch.int32)
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 3cbab840e..eb8ed6221 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -80,8 +80,38 @@ class LoRAModelRunnerMixin:
                                       lora_requests)
 
     @contextmanager
-    def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig,
-                                  num_scheduled_tokens: np.ndarray):
+    def maybe_setup_dummy_loras(self, lora_config):
+        if lora_config is None:
+            yield
+        else:
+            # __enter__ code
+            assert self.lora_manager is not None, "LoRA is not enabled"
+
+            num_loras = lora_config.max_loras
+
+            # Make dummy lora requests
+            lora_requests: set[LoRARequest] = {
+                LoRARequest(lora_name=f"warmup_{lora_id}",
+                            lora_int_id=lora_id,
+                            lora_path="/not/a/real/path")
+                for lora_id in range(1, num_loras + 1)
+            }
+
+            with self.lora_manager.dummy_lora_cache():
+                # Add the dummy LoRAs here so _set_active_loras doesn't try to
+                # load from disk.
+                for lr in lora_requests:
+                    self.lora_manager.add_dummy_lora(
+                        lr, rank=self.LORA_WARMUP_RANK)
+
+                yield
+
+            # __exit__ code
+            self.lora_manager.remove_all_adapters()
+
+    @contextmanager
+    def maybe_select_dummy_loras(self, lora_config: LoRAConfig,
+                                 num_scheduled_tokens: np.ndarray):
         if lora_config is None:
             yield
         else:
@@ -108,21 +138,18 @@ class LoRAModelRunnerMixin:
                 for lora_id in range(1, num_loras + 1)
             }
 
-            with self.lora_manager.dummy_lora_cache():
-                # Add the dummy LoRAs here so _set_active_loras doesn't try to
-                # load from disk.
-                for lr in lora_requests:
-                    self.lora_manager.add_dummy_lora(
-                        lr, rank=self.LORA_WARMUP_RANK)
-
-                self._set_active_loras(tuple(prompt_lora_mapping),
-                                       tuple(token_lora_mapping),
-                                       lora_requests)
+            self._set_active_loras(tuple(prompt_lora_mapping),
+                                   tuple(token_lora_mapping), lora_requests)
 
-                yield
+            yield
 
-            # __exit__ code
-            self.lora_manager.remove_all_adapters()
+    @contextmanager
+    def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig,
+                                  num_scheduled_tokens: np.ndarray):
+        with self.maybe_setup_dummy_loras(
+                lora_config), self.maybe_select_dummy_loras(
+                    lora_config, num_scheduled_tokens):
+            yield
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         if not self.lora_manager:
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 46bcf64ed..669908cb5 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -20,6 +20,7 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
+from vllm.lora.layers import BaseLayerWithLoRA
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs,
@@ -152,6 +153,9 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.hidden_size = model_config.get_hidden_size()
         self.vocab_size = model_config.get_vocab_size()
 
+        if self.lora_config is not None:
+            self.vocab_size += self.lora_config.lora_extra_vocab_size
+
         # Multi-modal data support
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
@@ -591,6 +595,17 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         logits_indices = self.query_start_loc_cpu[1:padded_num_reqs + 1] - 1
         logits_indices = logits_indices.to(self.device)
 
+        if self.lora_config is not None:
+            # We need to respect padding when activating LoRA adapters
+            padded_num_scheduled_tokens_per_req = np.copy(
+                num_scheduled_tokens_per_req
+            )  # Copying to avoid accidental state corruption bugs
+            padded_num_scheduled_tokens_per_req[-1] += \
+                padded_total_num_scheduled_tokens - total_num_scheduled_tokens
+
+            self.set_active_loras(self.input_batch,
+                                  padded_num_scheduled_tokens_per_req)
+
         layer_names = get_layers_from_vllm_config(self.vllm_config,
                                                   Attention).keys()
         per_layer_attn_metadata = {
@@ -916,6 +931,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             model = self.load_lora_model(model, self.model_config,
                                          self.scheduler_config,
                                          self.lora_config, self.device)
+            replace_set_lora(model)
 
         # Sync all pending XLA execution during model initialization and weight
         # loading.
@@ -980,7 +996,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             for layer_name in layer_names
         }
 
-        with self.maybe_dummy_run_with_lora(
+        with self.maybe_select_dummy_loras(
                 self.lora_config,
                 np.array([num_tokens], dtype=np.int32)), set_forward_context(
                     per_layer_attn_metadata, self.vllm_config, 0):
@@ -989,6 +1005,13 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                              inputs_embeds=inputs_embeds)
         self._hidden_states_dtype = out.dtype
 
+    def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping,
+                          lora_requests) -> None:
+        xm.mark_step()  # Captures input updates
+        super()._set_active_loras(prompt_lora_mapping, token_lora_mapping,
+                                  lora_requests)
+        xm.mark_step()  # Captures metadata updates
+
     def _precompile_mm_encoder(self) -> None:
         # Pre-compile MM encoder for all supported data modalities.
         hf_config = self.vllm_config.model_config.hf_config
@@ -1151,7 +1174,10 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                         generate_params_if_all_greedy,
                     ))
                 sampling_metadata.all_greedy = all_greedy
-                self.sample_from_logits(dummy_logits, sampling_metadata)
+                with self.maybe_select_dummy_loras(
+                        self.lora_config, np.array([num_reqs],
+                                                   dtype=np.int32)):
+                    self.sample_from_logits(dummy_logits, sampling_metadata)
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
@@ -1167,7 +1193,9 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                        dtype=self._hidden_states_dtype)
             dummy_tokens = torch.zeros((num_reqs, 1),
                                        dtype=torch.int64).to(self.device)
-            self.gather_logprobs(dummy_logits, dummy_tokens)
+            with self.maybe_select_dummy_loras(
+                    self.lora_config, np.array([num_reqs], dtype=np.int32)):
+                self.gather_logprobs(dummy_logits, dummy_tokens)
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
@@ -1178,13 +1206,14 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         """
         Precompile all the subgraphs with possible input shapes.
         """
-        self._precompile_mm_encoder()
-        self._precompile_backbone()
-        self._precompile_select_hidden_states()
-        self._precompile_compute_logits()
-        self._precompile_structured_decoding()
-        self._precompile_sample_from_logits()
-        self._precompile_gather_logprobs()
+        with self.maybe_setup_dummy_loras(self.lora_config):
+            self._precompile_mm_encoder()
+            self._precompile_backbone()
+            self._precompile_select_hidden_states()
+            self._precompile_compute_logits()
+            self._precompile_structured_decoding()
+            self._precompile_sample_from_logits()
+            self._precompile_gather_logprobs()
 
     def profile_run(
         self,
@@ -1467,11 +1496,11 @@ def _get_token_paddings(min_token_size: int, max_token_size: int,
                         padding_gap: int) -> list[int]:
     """Generate a list of padding size, starting from min_token_size, 
     ending with a number that can cover max_token_size
-    
+
     If padding_gap == 0 then:
         increase 2X each time (exponential)
     else:
-        first increase the size to twice, 
+        first increase the size to twice,
         then increase the padding size by padding_gap.
     """
     # assert min_token_size is power of 2
@@ -1508,3 +1537,32 @@ def _get_padded_token_len(paddings: list[int], x: int) -> int:
     index = bisect.bisect_left(paddings, x)
     assert index < len(paddings)
     return paddings[index]
+
+
+def replace_set_lora(model):
+
+    def _tpu_set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor] = None,
+    ):
+        # TODO: The integer index leads to a recompilation, but converting it
+        # to a tensor doesn't seem to work anymore. This might be fixed with a
+        # later release of torch_xla.
+        self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias)
+        xm.mark_step()
+
+    def _tpu_reset_lora(self, index: int):
+        self._original_reset_lora(index)
+        xm.mark_step()
+
+    for _, module in model.named_modules():
+        if isinstance(module, BaseLayerWithLoRA):
+            module._original_set_lora = module.set_lora
+            module._original_reset_lora = module.reset_lora
+            module.set_lora = _tpu_set_lora.__get__(module, module.__class__)
+            module.reset_lora = _tpu_reset_lora.__get__(
+                module, module.__class__)
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index fa4eb30cc..0707e17af 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -83,10 +83,6 @@ class TPUWorker:
         if self.model_config.seed is None:
             self.model_config.seed = 0
 
-        if vllm_config.lora_config is not None:
-            raise NotImplementedError(
-                "The V1 TPU backend doesn't support LoRA serving")
-
     def init_device(self):
         os.environ["PJRT_DEVICE"] = "TPU"
         # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D
@@ -166,7 +162,8 @@ class TPUWorker:
             runner_kv_caches)
 
         # `max_num_tokens >= max_num_batched_tokens` due to padding.
-        self.model_runner.profile_run(self.model_runner.max_num_tokens)
+        with self.model_runner.maybe_setup_dummy_loras(self.lora_config):
+            self.model_runner.profile_run(self.model_runner.max_num_tokens)
 
         # Synchronize before measuring the memory usage.
         xm.wait_device_ops()
-- 
GitLab


From 6dbe5b5c934268387fb03485a5625b1793049d8c Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 28 May 2025 22:32:19 +0100
Subject: [PATCH 029/274] Remove checks for `None` for fields which should
 never be `None` (#17985)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 4d9ca580f..fe2ad70f5 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4282,25 +4282,22 @@ class VllmConfig:
             self.model_config.verify_dual_chunk_attention_config(
                 self.load_config)
 
-        if self.cache_config is not None:
-            self.cache_config.verify_with_parallel_config(self.parallel_config)
+        self.cache_config.verify_with_parallel_config(self.parallel_config)
 
-        if self.lora_config:
+        if self.lora_config is not None:
             self.lora_config.verify_with_cache_config(self.cache_config)
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_lora_support()
-        if self.prompt_adapter_config:
+        if self.prompt_adapter_config is not None:
             self.prompt_adapter_config.verify_with_model_config(
                 self.model_config)
 
-        if self.quant_config is None and \
-            self.model_config is not None and self.load_config is not None:
+        if self.quant_config is None and self.model_config is not None:
             self.quant_config = VllmConfig._get_quantization_config(
                 self.model_config, self.load_config)
 
         from vllm.platforms import current_platform
-        if self.scheduler_config is not None and \
-            self.model_config is not None and \
+        if self.model_config is not None and \
             self.scheduler_config.chunked_prefill_enabled and \
             self.model_config.dtype == torch.float32 and \
             current_platform.get_device_capability() == (7, 5):
@@ -4335,8 +4332,7 @@ class VllmConfig:
 
         self._set_cudagraph_sizes()
 
-        if self.cache_config is not None and \
-            self.cache_config.cpu_offload_gb > 0 and \
+        if self.cache_config.cpu_offload_gb > 0 and \
             self.compilation_config.level != CompilationLevel.NO_COMPILATION \
                 and not envs.VLLM_USE_V1:
             logger.warning(
@@ -4358,16 +4354,16 @@ class VllmConfig:
                 "full_cuda_graph is not supported with "
                 "cascade attention. Disabling cascade attention.")
             self.model_config.disable_cascade_attn = True
-            if self.cache_config is not None:
-                self.cache_config.enable_prefix_caching = False
+            self.cache_config.enable_prefix_caching = False
 
-        if (self.kv_events_config
+        if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
                 and not self.cache_config.enable_prefix_caching):
             logger.warning(
                 "KV cache events are on, but prefix caching is not enabled."
                 "Use --enable-prefix-caching to enable.")
-        if (self.kv_events_config and self.kv_events_config.publisher != "null"
+        if (self.kv_events_config is not None
+                and self.kv_events_config.publisher != "null"
                 and not self.kv_events_config.enable_kv_cache_events):
             logger.warning("KV cache events are disabled,"
                            "but the scheduler is configured to publish them."
-- 
GitLab


From 7951d78738581c336db7c1a77f94f1fea8f09fca Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Wed, 28 May 2025 18:55:30 -0400
Subject: [PATCH 030/274] [Core] Enable CUDA graphs for DP + All2All kernels 
 (#18724)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
---
 vllm/forward_context.py                       | 63 ++++++++++++-------
 vllm/model_executor/layers/fused_moe/layer.py | 42 ++++++++++++-
 vllm/platforms/cuda.py                        | 11 ----
 vllm/v1/worker/gpu_model_runner.py            | 21 ++++++-
 4 files changed, 100 insertions(+), 37 deletions(-)

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 3c8083e3d..592ca650a 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -10,7 +10,7 @@ import torch
 import torch.distributed as dist
 
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
@@ -30,6 +30,44 @@ class DPMetadata:
     max_tokens_across_dp_cpu: torch.Tensor
     cu_tokens_across_dp_cpu: torch.Tensor
 
+    @staticmethod
+    def num_tokens_across_dp(num_tokens: int, dp_size: int,
+                             dp_rank: int) -> torch.Tensor:
+        """
+        Gather the num_tokens across all DP ranks and return results in a
+        CPU tensor of size dp_size.
+        """
+        num_tokens_across_dp = [0] * dp_size
+        num_tokens_across_dp[dp_rank] = num_tokens
+        num_tokens_tensor = torch.tensor(num_tokens_across_dp,
+                                         device="cpu",
+                                         dtype=torch.int32)
+        from vllm.distributed.parallel_state import get_dp_group
+        dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
+        return num_tokens_tensor
+
+    @staticmethod
+    def make(parallel_config: ParallelConfig, attn_metadata: Any,
+             num_tokens: int) -> "DPMetadata":
+
+        assert parallel_config.data_parallel_size > 1
+        dp_size = parallel_config.data_parallel_size
+        dp_rank = parallel_config.data_parallel_rank
+        if attn_metadata is not None and hasattr(attn_metadata,
+                                                 "num_prefill_tokens"):
+            # for v0 attention backends
+            batchsize = attn_metadata.num_prefill_tokens + \
+                attn_metadata.num_decode_tokens
+        else:
+            # for v1 attention backends or no attn_metadata
+            batchsize = num_tokens
+
+        num_tokens_tensor = DPMetadata.num_tokens_across_dp(
+            batchsize, dp_size, dp_rank)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_tensor)
+        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
+        return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu)
+
 
 @dataclass
 class ForwardContext:
@@ -74,27 +112,8 @@ def set_forward_context(attn_metadata: Any,
         forward_start_time = time.perf_counter()
     dp_metadata: Optional[DPMetadata] = None
     if vllm_config.parallel_config.data_parallel_size > 1:
-        dp_size = vllm_config.parallel_config.data_parallel_size
-        dp_rank = vllm_config.parallel_config.data_parallel_rank
-        if attn_metadata is not None and hasattr(attn_metadata,
-                                                 "num_prefill_tokens"):
-            # for v0 attention backends
-            batchsize = attn_metadata.num_prefill_tokens + \
-                attn_metadata.num_decode_tokens
-        else:
-            # for v1 attention backends or no attn_metadata
-            batchsize = num_tokens
-        num_tokens_across_dp = [0] * dp_size
-        num_tokens_across_dp[dp_rank] = batchsize
-        num_tokens_tensor = torch.tensor(num_tokens_across_dp,
-                                         device="cpu",
-                                         dtype=torch.int32)
-        from vllm.distributed.parallel_state import get_dp_group
-        dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
-        max_tokens_across_dp_cpu = torch.max(num_tokens_tensor)
-        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
-        dp_metadata = DPMetadata(max_tokens_across_dp_cpu,
-                                 cu_tokens_across_dp_cpu)
+        dp_metadata = DPMetadata.make(vllm_config.parallel_config,
+                                      attn_metadata, num_tokens)
 
     global _forward_context
     prev_context = _forward_context
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 29b41e720..838a7c24b 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -828,6 +828,21 @@ class FusedMoE(torch.nn.Module):
 
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 
+        # Chunked all2all staging tensor
+        self.batched_hidden_states: Optional[torch.Tensor] = None
+        self.batched_router_logits: Optional[torch.Tensor] = None
+        if self.moe_parallel_config.use_pplx_kernels:
+            act_dtype = vllm_config.model_config.dtype
+            self.batched_hidden_states = torch.zeros(
+                (MOE_DP_CHUNK_SIZE, self.hidden_size),
+                dtype=act_dtype,
+                device=torch.cuda.current_device())
+
+            self.batched_router_logits = torch.zeros(
+                (MOE_DP_CHUNK_SIZE, self.global_num_experts),
+                dtype=act_dtype,
+                device=torch.cuda.current_device())
+
     @property
     def tp_size(self):
         return self.moe_parallel_config.tp_size
@@ -1217,18 +1232,39 @@ class FusedMoE(torch.nn.Module):
 
     def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                              full_router_logits: torch.Tensor):
+        assert self.batched_hidden_states is not None
+        assert self.batched_router_logits is not None
+        assert self.batched_hidden_states.dtype == full_hidden_states.dtype
+        assert self.batched_router_logits.dtype == full_router_logits.dtype
+        # Check size compatibility.
+        assert (
+            self.batched_hidden_states.size(-1) == full_hidden_states.size(-1))
+        assert (
+            self.batched_router_logits.size(-1) == full_router_logits.size(-1))
 
         full_final_hidden_states = torch.empty_like(full_hidden_states)
 
         def process_chunk(chunk_start, chunk_end, skip_result_store=False):
+            chunk_size = chunk_end - chunk_start
             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
             router_logits = full_router_logits[chunk_start:chunk_end, :]
 
+            assert (self.batched_hidden_states.size(0)  # type: ignore
+                    >= chunk_size)
+            assert (self.batched_router_logits.size(0)  # type: ignore 
+                    >= chunk_size)
+            staged_hidden_states = self.batched_hidden_states[:
+                                                              chunk_size, :]  # type: ignore
+            staged_router_logits = self.batched_router_logits[:
+                                                              chunk_size, :]  # type: ignore
+            staged_hidden_states.copy_(hidden_states, non_blocking=True)
+            staged_router_logits.copy_(router_logits, non_blocking=True)
+
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
                 layer=self,
-                x=hidden_states,
-                router_logits=router_logits,
+                x=staged_hidden_states,
+                router_logits=staged_router_logits,
                 top_k=self.top_k,
                 renormalize=self.renormalize,
                 use_grouped_topk=self.use_grouped_topk,
@@ -1244,7 +1280,7 @@ class FusedMoE(torch.nn.Module):
 
             if not skip_result_store:
                 full_final_hidden_states[chunk_start:chunk_end, :].copy_(
-                    final_hidden_states)
+                    final_hidden_states, non_blocking=True)
 
         ctx = get_forward_context()
         max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 0bed44f73..9f833cbb5 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -106,7 +106,6 @@ class CudaPlatformBase(Platform):
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
-        compilation_config = vllm_config.compilation_config
         model_config = vllm_config.model_config
 
         if parallel_config.worker_cls == "auto":
@@ -154,16 +153,6 @@ class CudaPlatformBase(Platform):
                 logger.info(
                     "Forcing kv cache block size to 64 for FlashMLA backend.")
 
-        if (parallel_config.data_parallel_size > 1
-                and compilation_config.use_cudagraph):
-            logger.info(
-                "Data Parallel: Forcing enforce eager to be True since DP is "
-                "currently not supported with CUDA Graphs.")
-            vllm_config.model_config.enforce_eager = True
-            compilation_config.use_cudagraph = False
-            # FIXME: inductor breaks cudagraph (from @bnell)
-            compilation_config.use_inductor = False
-
     @classmethod
     def get_current_memory_usage(cls,
                                  device: Optional[torch.types.Device] = None
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 5d5558162..d1195bcfb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -24,7 +24,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (
     get_pp_group, get_tp_group, graph_capture,
     prepare_communication_buffer_for_model)
-from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.forward_context import (DPMetadata, get_forward_context,
+                                  set_forward_context)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import TensorizerLoader, get_model
@@ -1104,6 +1105,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             for k, v in self.intermediate_tensors.items()
         })
 
+    def get_dp_padding(self, num_tokens: int):
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        dp_rank = self.vllm_config.parallel_config.data_parallel_rank
+        if dp_size == 1:
+            # Early exit.
+            return 0
+
+        num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
+            num_tokens, dp_size, dp_rank)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
+        return max_tokens_across_dp_cpu - num_tokens
+
     @torch.inference_mode()
     def execute_model(
         self,
@@ -1141,6 +1154,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             else:
                 num_input_tokens = num_scheduled_tokens
 
+        # Padding for DP
+        num_input_tokens += self.get_dp_padding(num_input_tokens)
+
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -1658,6 +1674,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         skip_attn: bool = True,
     ) -> torch.Tensor:
 
+        # Padding for DP
+        num_tokens += self.get_dp_padding(num_tokens)
+
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
         # has num_tokens in total.
-- 
GitLab


From 269d901734326432d5ef15deaca07364149f9b48 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Wed, 28 May 2025 19:21:46 -0400
Subject: [PATCH 031/274] [Bugfix][ROCm] fix the power of 2 exception from
 triton_unified_attention.py when running llama4 models and unit test fix
 (#18100)

Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .../test_triton_unified_attention.py          |   4 +-
 .../attention/ops/triton_unified_attention.py | 106 +++++++++---------
 2 files changed, 54 insertions(+), 56 deletions(-)

diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index 4e15d0025..be3d1879d 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -13,7 +13,9 @@ HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
 
 DTYPES = [torch.float16, torch.bfloat16]
-QDTYPES = [None, torch.float8_e4m3fn]
+QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [
+    None, torch.float8_e4m3fnuz
+]
 # one value large enough to test overflow in index calculation.
 # one value small enough to test the schema op check
 NUM_BLOCKS = [32768, 2048]
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 4bced7797..87cf333f7 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -29,41 +29,42 @@ def apply_softcap(S, x):
 
 @triton.jit
 def kernel_unified_attention_2d(
-    output_ptr,  # [num_tokens, num_query_heads, head_size]
-    query_ptr,  # [num_tokens, num_query_heads, head_size]
-    key_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
-    value_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
-    block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
-    seq_lens_ptr,  # [num_seqs]
-    alibi_slopes_ptr,  # [num_query_heads]
-    scale,  # float32
-    k_scale,  # float32
-    v_scale,  # float32
-    softcap,  # float32
-    num_query_heads: tl.constexpr,  # int
-    num_queries_per_kv: tl.constexpr,  # int
-    block_table_stride: tl.int64,  # int
-    query_stride_0: tl.int64,  # int
-    query_stride_1: tl.int64,  # int, should be equal to head_size
-    output_stride_0: tl.int64,  # int
-    output_stride_1: tl.int64,  # int, should be equal to head_size
-    BLOCK_SIZE: tl.constexpr,  # int
-    HEAD_SIZE: tl.constexpr,  # int
-    HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
-    USE_ALIBI_SLOPES: tl.constexpr,  # bool
-    USE_SOFTCAP: tl.constexpr,  # bool
-    SLIDING_WINDOW: tl.constexpr,  # int
-    stride_k_cache_0: tl.int64,  # int
-    stride_k_cache_1: tl.int64,  # int
-    stride_k_cache_2: tl.int64,  # int
-    stride_k_cache_3: tl.constexpr,  # int
-    stride_v_cache_0: tl.int64,  # int
-    stride_v_cache_1: tl.int64,  # int
-    stride_v_cache_2: tl.int64,  # int
-    stride_v_cache_3: tl.constexpr,  # int
-    query_start_len_ptr,  # [num_seqs+1]
-    BLOCK_Q: tl.constexpr,  # int
-    num_seqs: tl.int32,
+        output_ptr,  # [num_tokens, num_query_heads, head_size]
+        query_ptr,  # [num_tokens, num_query_heads, head_size]
+        key_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
+        value_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
+        block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
+        seq_lens_ptr,  # [num_seqs]
+        alibi_slopes_ptr,  # [num_query_heads]
+        scale,  # float32
+        k_scale,  # float32
+        v_scale,  # float32
+        softcap,  # float32
+        num_query_heads: tl.constexpr,  # int
+        num_queries_per_kv: tl.constexpr,  # int
+        block_table_stride: tl.int64,  # int
+        query_stride_0: tl.int64,  # int
+        query_stride_1: tl.int64,  # int, should be equal to head_size
+        output_stride_0: tl.int64,  # int
+        output_stride_1: tl.int64,  # int, should be equal to head_size
+        BLOCK_SIZE: tl.constexpr,  # int
+        HEAD_SIZE: tl.constexpr,  # int
+        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
+        USE_ALIBI_SLOPES: tl.constexpr,  # bool
+        USE_SOFTCAP: tl.constexpr,  # bool
+        SLIDING_WINDOW: tl.constexpr,  # int
+        stride_k_cache_0: tl.int64,  # int
+        stride_k_cache_1: tl.int64,  # int
+        stride_k_cache_2: tl.int64,  # int
+        stride_k_cache_3: tl.constexpr,  # int
+        stride_v_cache_0: tl.int64,  # int
+        stride_v_cache_1: tl.int64,  # int
+        stride_v_cache_2: tl.int64,  # int
+        stride_v_cache_3: tl.constexpr,  # int
+        query_start_len_ptr,  # [num_seqs+1]
+        BLOCK_Q: tl.constexpr,  # int
+        num_seqs: tl.int32,
+        BLOCK_M: tl.constexpr,  # int
 ):
 
     q_block_global_idx = tl.program_id(0)
@@ -94,15 +95,13 @@ def kernel_unified_attention_2d(
     if q_block_local_idx * BLOCK_Q >= cur_batch_query_len:
         return
 
-    offs_m = tl.arange(0, BLOCK_Q * num_queries_per_kv)
+    offs_m = tl.arange(0, BLOCK_M)
     offs_d = tl.arange(0, HEAD_SIZE_PADDED)
-
     query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv
 
     query_offset_0 = cur_batch_in_all_start_index + query_pos
     query_offset_1 = kv_head_idx * num_queries_per_kv + \
         offs_m % num_queries_per_kv
-
     query_offset = (query_offset_0[:, None] * query_stride_0 +
                     query_offset_1[:, None] * query_stride_1 + offs_d[None, :])
 
@@ -110,7 +109,7 @@ def kernel_unified_attention_2d(
     query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1)
     query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1)
 
-    # Q : (BLOCK_Q * num_queries_per_kv, HEAD_SIZE,)
+    # Q : (BLOCK_M, HEAD_SIZE_PADDED)
     Q = tl.load(
         query_ptr + query_offset,
         mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None],
@@ -119,12 +118,9 @@ def kernel_unified_attention_2d(
 
     block_table_offset = seq_idx * block_table_stride
 
-    M = tl.full([BLOCK_Q * num_queries_per_kv],
-                float("-inf"),
-                dtype=tl.float32)
-    L = tl.full([BLOCK_Q * num_queries_per_kv], 1.0, dtype=tl.float32)
-    acc = tl.zeros([BLOCK_Q * num_queries_per_kv, HEAD_SIZE_PADDED],
-                   dtype=tl.float32)
+    M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    L = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32)
 
     # sequence len for this particular sequence
     seq_len = tl.load(seq_lens_ptr + seq_idx)
@@ -183,13 +179,12 @@ def kernel_unified_attention_2d(
         else:
             V = V_load
 
-        seq_offset = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        seq_offset = j * BLOCK_SIZE + offs_n
 
         seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1
 
-        # S : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
-        S = tl.zeros(shape=(BLOCK_Q * num_queries_per_kv, BLOCK_SIZE),
-                     dtype=tl.float32)
+        # S : (BLOCK_M, BLOCK_SIZE)
+        S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32)
 
         S += scale * tl.dot(Q, K)
 
@@ -207,29 +202,29 @@ def kernel_unified_attention_2d(
             S += alibi_slope[:, None] * (seq_offset - context_len)
 
         # compute running maximum
-        # m_j : (BLOCK_Q * num_queries_per_kv,)
+        # m_j : (BLOCK_M,)
         m_j = tl.maximum(M, tl.max(S, axis=1))
         # For sliding window there's a chance the max is -inf due to masking of
         # the entire row. In this case we need to set m_j 0 to avoid NaN
         m_j = tl.where(m_j > float("-inf"), m_j, 0.0)
 
-        # P : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
+        # P : (BLOCK_M, BLOCK_SIZE)
         P = tl.exp(S - m_j[:, None])
 
-        # l_j : (BLOCK_Q * num_queries_per_kv,)
+        # l_j : (BLOCK_M,)
         l_j = tl.sum(P, axis=1)
 
-        # alpha : (BLOCK_Q * num_queries_per_kv, )
+        # alpha : (BLOCK_M, )
         alpha = tl.exp(M - m_j)
 
-        # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
+        # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc = acc * alpha[:, None]
 
         # update constants
         L = L * alpha + l_j
         M = m_j
 
-        # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
+        # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc += tl.dot(P.to(V.dtype), V)
 
     # epilogue
@@ -334,4 +329,5 @@ def unified_attention(
         query_start_len_ptr=cu_seqlens_q,
         BLOCK_Q=BLOCK_Q,
         num_seqs=num_seqs,
+        BLOCK_M=BLOCK_M,
     )
-- 
GitLab


From 515b413ebf8feb8093556dbef21a02f9f6f877cc Mon Sep 17 00:00:00 2001
From: Maximilien de Bayser <mbayser@br.ibm.com>
Date: Wed, 28 May 2025 23:16:17 -0300
Subject: [PATCH 032/274] Prevent the cross-encoder logic from being applied to
 classification tasks (#18838)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 vllm/model_executor/layers/pooler.py     | 34 ++++++++++++++----------
 vllm/model_executor/models/bert.py       |  6 ++---
 vllm/model_executor/models/modernbert.py |  7 ++---
 vllm/model_executor/models/roberta.py    |  6 +++--
 vllm/transformers_utils/config.py        | 11 ++++++--
 5 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 6abbc9081..d2c42191b 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -6,10 +6,9 @@ from typing import Optional, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import PretrainedConfig
 from typing_extensions import assert_never
 
-from vllm.config import PoolerConfig
+from vllm.config import ModelConfig, PoolerConfig
 from vllm.model_executor.pooling_metadata import (PoolingMetadata,
                                                   PoolingTensors)
 from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
@@ -283,30 +282,37 @@ class Pooler(nn.Module):
         )
 
 
-class CrossEncodingPooler(nn.Module):
-    """A layer that pools specific information from hidden states.
+class ClassifierPooler(nn.Module):
+    """A pooling layer for classification tasks.
 
     This layer does the following:
-    1. Extracts specific tokens or aggregates data based on pooling method.
-    2. Normalizes output if specified.
-    3. Returns structured results as `PoolerOutput`.
-
-    Attributes:
-        pooling_type: The type of pooling to use.
-        normalize: Whether to normalize the pooled data.
+    1. Applies a classification layer to the hidden states.
+    2. Optionally applies a pooler layer.
+    3. Applies an activation function to the output. In the case of
+       classification models it is either sigmoid or softmax. In the
+       case of scoring models, the same behavior is configuration
+       dependent, as in the sentence-transformers library.
     """
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: ModelConfig,
         classifier: nn.Module,
         pooler: Optional[nn.Module] = None,
     ):
         super().__init__()
         self.classifier = classifier
         self.pooler = pooler
-        self.default_activation_function = \
-            get_cross_encoder_activation_function(config)
+
+        if config.task == "score":
+            self.default_activation_function = \
+                get_cross_encoder_activation_function(config.hf_config)
+        elif config.task == "classify":
+            self.default_activation_function = nn.Sigmoid() \
+                if config.hf_config.num_labels == 1 else nn.Softmax()
+        else:
+            raise NotImplementedError(f"task={config.task!r} is not supported"
+                                      " with the classification pooler")
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 0c6593bbe..0b1d0f103 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -16,7 +16,7 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler,
+from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
                                                PoolingType)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -470,8 +470,8 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
                               embedding_class=BertEmbedding,
                               add_pooling_layer=True)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        self._pooler = CrossEncodingPooler(config, self.classifier,
-                                           self.bert.pooler)
+        self._pooler = ClassifierPooler(vllm_config.model_config,
+                                        self.classifier, self.bert.pooler)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
 
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 86552aa05..18eab6051 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -12,7 +12,7 @@ from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.pooler import CrossEncodingPooler
+from vllm.model_executor.layers.pooler import ClassifierPooler
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
@@ -278,8 +278,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
         self.model = ModernBertModel(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "modernbert"))
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        self._pooler = CrossEncodingPooler(config, self.classifier,
-                                           ModernBertPooler(config))
+        self._pooler = ClassifierPooler(vllm_config.model_config,
+                                        self.classifier,
+                                        ModernBertPooler(config))
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
 
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 9a4d0ab2d..76008b729 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -9,7 +9,7 @@ from torch import nn
 from transformers import RobertaConfig
 
 from vllm.config import VllmConfig
-from vllm.model_executor.layers.pooler import CrossEncodingPooler
+from vllm.model_executor.layers.pooler import ClassifierPooler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -186,7 +186,9 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
                                  embedding_class=RobertaEmbedding,
                                  add_pooling_layer=False)
         self.classifier = RobertaClassificationHead(config)
-        self._pooler = CrossEncodingPooler(config, self.classifier)
+
+        self._pooler = ClassifierPooler(vllm_config.model_config,
+                                        self.classifier)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         bert_weights, task_weights = roberta_task_weights_filter(weights)
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 69e7207cc..2ed71a4d3 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -823,10 +823,17 @@ def try_get_generation_config(
 
 
 def get_cross_encoder_activation_function(config: PretrainedConfig):
-    if (hasattr(config, "sbert_ce_default_activation_function")
-            and config.sbert_ce_default_activation_function is not None):
 
+    function_name: Optional[str] = None
+    if hasattr(config, "sentence_transformers") and "activation_fn" in \
+        config.sentence_transformers:
+        function_name = config.sentence_transformers["activation_fn"]
+
+    elif (hasattr(config, "sbert_ce_default_activation_function")
+          and config.sbert_ce_default_activation_function is not None):
         function_name = config.sbert_ce_default_activation_function
+
+    if function_name is not None:
         assert function_name.startswith("torch.nn.modules."), \
             "Loading of activation functions is restricted to " \
             "torch.nn.modules for security reasons"
-- 
GitLab


From 26b4fa45bead5d65d4e15bfaffaa52ac71bea270 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Wed, 28 May 2025 22:16:52 -0400
Subject: [PATCH 033/274] Add ability to use CUDAGraphs with use_inductor=False
 (#17345)

Signed-off-by: rzou <zou3519@gmail.com>
---
 tests/compile/piecewise/test_simple.py    | 11 ++++++-
 tests/compile/piecewise/test_toy_llama.py | 40 +++++++++++++++++++----
 vllm/compilation/compiler_interface.py    |  4 +++
 vllm/compilation/counter.py               |  4 +++
 vllm/config.py                            |  5 ---
 5 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 143cb4969..5ce520a44 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -74,11 +74,12 @@ class SillyModel(nn.Module):
         return x
 
 
-def test_simple_piecewise_compile():
+def _test_simple_piecewise_compile(*, use_inductor):
 
     vllm_config = VllmConfig(compilation_config=CompilationConfig(
         level=CompilationLevel.PIECEWISE,
         use_cudagraph=True,
+        use_inductor=use_inductor,
         splitting_ops=["silly.attention"],
         cudagraph_copy_inputs=True,
         cudagraph_capture_sizes=[1, 2],
@@ -108,3 +109,11 @@ def test_simple_piecewise_compile():
         output = model(input)
         assert global_counter == 2
         assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
+
+
+def test_simple_piecewise_compile_inductor():
+    _test_simple_piecewise_compile(use_inductor=True)
+
+
+def test_simple_piecewise_compile_no_inductor():
+    _test_simple_piecewise_compile(use_inductor=False)
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index d4551b1cc..22560befc 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -261,12 +261,14 @@ def tractable_computation(input_ids: torch.Tensor,
 @torch.inference_mode
 def run_model(llama_config,
               use_compile: bool,
+              use_inductor: bool,
               split_attn: bool = False) -> torch.Tensor:
 
     if use_compile:
         compilation_config = CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
+            use_inductor=use_inductor,
             cudagraph_capture_sizes=[1, 2],
         )
         if split_attn:
@@ -304,7 +306,7 @@ def run_model(llama_config,
         return output.cpu()
 
 
-def test_toy_llama():
+def _test_toy_llama(*, use_inductor):
     # compare output with and without piecewise compilation
 
     llama_config = LlamaConfig(hidden_size=128,
@@ -326,8 +328,14 @@ def test_toy_llama():
             num_backend_compilations=0,
             num_cudagraph_caputured=0,
     ):
-        outputs.append(run_model(llama_config, use_compile=False))
-    run_model(tractable_config, use_compile=False)
+        outputs.append(
+            run_model(llama_config, use_inductor=False, use_compile=False))
+    run_model(tractable_config, use_inductor=False, use_compile=False)
+
+    if use_inductor:
+        kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
+    else:
+        kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
 
     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
@@ -336,9 +344,13 @@ def test_toy_llama():
             num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
             num_cudagraph_caputured=
             2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+            **kwargs,
     ):
-        outputs.append(run_model(llama_config, use_compile=True))
-    run_model(tractable_config, use_compile=True)
+        outputs.append(
+            run_model(llama_config,
+                      use_inductor=use_inductor,
+                      use_compile=True))
+    run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
 
     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
@@ -353,13 +365,27 @@ def test_toy_llama():
          ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
         outputs.append(
-            run_model(llama_config, use_compile=True, split_attn=True))
-    run_model(tractable_config, use_compile=True, split_attn=True)
+            run_model(llama_config,
+                      use_inductor=use_inductor,
+                      use_compile=True,
+                      split_attn=True))
+    run_model(tractable_config,
+              use_inductor=use_inductor,
+              use_compile=True,
+              split_attn=True)
 
     for i in range(1, len(outputs)):
         assert torch.allclose(outputs[0], outputs[i])
 
 
+def test_toy_llama_inductor():
+    _test_toy_llama(use_inductor=True)
+
+
+def test_toy_no_inductor():
+    _test_toy_llama(use_inductor=False)
+
+
 @torch.inference_mode
 def benchmark():
     from triton.testing import do_bench
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 21af5eb76..7e9186f86 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -12,6 +12,7 @@ import torch._inductor.compile_fx
 import torch.fx as fx
 
 import vllm.envs as envs
+from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
 from vllm.utils import is_torch_equal_or_newer
 
@@ -175,6 +176,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_inductor_compiles += 1
         current_config = {}
         if compiler_config is not None:
             current_config.update(compiler_config)
@@ -262,6 +264,7 @@ class InductorAdaptor(CompilerInterface):
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_inductor_compiles += 1
         from torch._inductor.compile_fx import compile_fx
         current_config = {}
         if compiler_config is not None:
@@ -528,6 +531,7 @@ class EagerAdaptor(CompilerInterface):
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_eager_compiles += 1
         # we don't need to compile the graph, just return the graph itself.
         # It does not support caching, return None for the handle.
         return graph, None
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index 5be452593..2200671b8 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -15,6 +15,10 @@ class CompilationCounter:
     num_piecewise_capturable_graphs_seen: int = 0
     num_backend_compilations: int = 0
     num_cudagraph_caputured: int = 0
+    # InductorAdapter.compile calls
+    num_inductor_compiles: int = 0
+    # EagerAdapter.compile calls
+    num_eager_compiles: int = 0
 
     def clone(self) -> "CompilationCounter":
         return copy.deepcopy(self)
diff --git a/vllm/config.py b/vllm/config.py
index fe2ad70f5..3172cbe45 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4315,15 +4315,10 @@ class VllmConfig:
             self.compilation_config.custom_ops.append("+rms_norm")
         if envs.VLLM_USE_V1 and self.model_config is not None and \
             not self.model_config.enforce_eager:
-            # NOTE(woosuk): Currently, we use inductor because the piecewise
-            # CUDA graphs do not work properly with the custom CUDA kernels.
-            # FIXME(woosuk): Disable inductor to reduce the compilation time
-            # and avoid any potential issues with the inductor.
             # FIXME(rob): Add function to set all of these.
             if not self.compilation_config.custom_ops:
                 self.compilation_config.custom_ops = ["none"]
             self.compilation_config.use_cudagraph = True
-            self.compilation_config.use_inductor = True
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
-- 
GitLab


From 8e882ffdc0b4e9e3e6fc3fabe1c199fb6b8736a2 Mon Sep 17 00:00:00 2001
From: Chengji Yao <chengjiyao@google.com>
Date: Wed, 28 May 2025 19:34:19 -0700
Subject: [PATCH 034/274] [Bugfix][TPU] fix moe custom kernel import (#18853)

Signed-off-by: Chengji Yao <chengjiyao@google.com>
---
 vllm/model_executor/layers/fused_moe/moe_pallas.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index babeb9730..539459992 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn.functional as F
+import torch_xla.experimental.custom_kernel  # noqa: F401
 
 
 def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
-- 
GitLab


From 1661a9c28f090505fe127ca4cb82678486a523cc Mon Sep 17 00:00:00 2001
From: aws-elaineyz <elaineyz@amazon.com>
Date: Wed, 28 May 2025 19:44:01 -0700
Subject: [PATCH 035/274] [Doc][Neuron] Update documentation for Neuron
 (#18868)

Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
---
 docs/features/compatibility_matrix.md         |   3 +
 .../quantization/supported_hardware.md        |   6 +-
 .../installation/ai_accelerator/neuron.inc.md | 184 +++++++++---------
 vllm/config.py                                |   2 +-
 4 files changed, 100 insertions(+), 95 deletions(-)

diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md
index 77ceea49f..71882d317 100644
--- a/docs/features/compatibility_matrix.md
+++ b/docs/features/compatibility_matrix.md
@@ -75,3 +75,6 @@ th:not(:first-child) {
 | multi-step                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](gh-issue:8477) | ✅     |
 | best-of                                                   | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
 | beam-search                                               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+
+!!! note
+    Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware
diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md
index 2967bf9c7..6a585b1cc 100644
--- a/docs/features/quantization/supported_hardware.md
+++ b/docs/features/quantization/supported_hardware.md
@@ -5,13 +5,13 @@ title: Supported Hardware
 
 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
 
-| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | x86 CPU   | AWS Inferentia   | Google TPU   |
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | x86 CPU   | AWS Neuron   | Google TPU   |
 |-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
 | AWQ                   | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
 | GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
 | Marlin (GPTQ/AWQ/FP8) | ❌       | ❌        | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
-| INT8 (W8A8)           | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ✅︎        | ❌                | ✅︎           |
-| FP8 (W8A8)            | ❌       | ❌        | ❌        | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ❌                | ❌            |
+| INT8 (W8A8)           | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ✅︎        | ✅︎                | ✅︎           |
+| FP8 (W8A8)            | ❌       | ❌        | ❌        | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ✅︎                | ❌            |
 | BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
 | AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
 | bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
diff --git a/docs/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/ai_accelerator/neuron.inc.md
index f08c78fba..86c12472f 100644
--- a/docs/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -1,8 +1,9 @@
 # --8<-- [start:installation]
 
-vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
-Paged Attention and Chunked Prefill are currently in development and will be available soon.
-Data types currently supported in Neuron SDK are FP16 and BF16.
+[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and 
+    generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, 
+    and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. 
+    This tab describes how to set up your environment to run vLLM on Neuron.
 
 !!! warning
     There are no pre-built wheels or images for this device, so you must build vLLM from source.
@@ -11,58 +12,30 @@ Data types currently supported in Neuron SDK are FP16 and BF16.
 # --8<-- [start:requirements]
 
 - OS: Linux
-- Python: 3.9 -- 3.11
-- Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
-- Pytorch 2.0.1/2.1.1
-- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
+- Python: 3.9 or newer
+- Pytorch 2.5/2.6
+- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips)
+- AWS Neuron SDK 2.23
 
 ## Configure a new environment
 
-### Launch Trn1/Inf2 instances
+### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies
 
-Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
+The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this 
+[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image).
 
-- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
-- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/)
-- Select Ubuntu Server 22.04 TLS AMI
-- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
 - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
-
-### Install drivers and tools
-
-The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
-
+- Once inside your instance, activate the pre-installed virtual environment for inference by running
 ```console
-# Configure Linux for Neuron repository updates
-. /etc/os-release
-sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
-deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
-EOF
-wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \
-    | sudo apt-key add -
-
-# Update OS packages
-sudo apt-get update -y
-
-# Install OS headers
-sudo apt-get install linux-headers-$(uname -r) -y
-
-# Install git
-sudo apt-get install git -y
-
-# install Neuron Driver
-sudo apt-get install aws-neuronx-dkms=2.* -y
-
-# Install Neuron Runtime
-sudo apt-get install aws-neuronx-collectives=2.* -y
-sudo apt-get install aws-neuronx-runtime-lib=2.* -y
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
 
-# Install Neuron Tools
-sudo apt-get install aws-neuronx-tools=2.* -y
+Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html) 
+for alternative setup instructions including using Docker and manually installing dependencies.
 
-# Add PATH
-export PATH=/opt/aws/neuron/bin:$PATH
-```
+!!! note
+    NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) 
+    library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html).  
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -75,60 +48,37 @@ Currently, there are no pre-built Neuron wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-!!! note
-    The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-
-Following instructions are applicable to Neuron SDK 2.16 and beyond.
-
-#### Install transformers-neuronx and its dependencies
+#### Install vLLM from source
 
-[transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances.
-Follow the steps below to install transformer-neuronx package and its dependencies.
+Install vllm as follows:
 
 ```console
-# Install Python venv
-sudo apt-get install -y python3.10-venv g++
-
-# Create Python venv
-python3.10 -m venv aws_neuron_venv_pytorch
-
-# Activate Python venv
-source aws_neuron_venv_pytorch/bin/activate
-
-# Install Jupyter notebook kernel
-pip install ipykernel
-python3.10 -m ipykernel install \
-    --user \
-    --name aws_neuron_venv_pytorch \
-    --display-name "Python (torch-neuronx)"
-pip install jupyter notebook
-pip install environment_kernels
-
-# Set pip repository pointing to the Neuron repository
-python -m pip config set \
-    global.extra-index-url \
-    https://pip.repos.neuron.amazonaws.com
-
-# Install wget, awscli
-python -m pip install wget
-python -m pip install awscli
-
-# Update Neuron Compiler and Framework
-python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -U -r requirements/neuron.txt
+VLLM_TARGET_DEVICE="neuron" pip install -e .
 ```
 
-#### Install vLLM from source
+AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at 
+    [https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's 
+    available on vLLM V0. Please utilize the AWS Fork for the following features:
+
+- Llama-3.2 multi-modal support
+- Multi-node distributed inference 
 
-Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
+Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) 
+    for more details and usage examples.
+
+To install the AWS Neuron fork, run the following:
 
 ```console
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -U -r requirements/neuron.txt
-VLLM_TARGET_DEVICE="neuron" pip install .
+git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
+cd upstreaming-to-vllm
+pip install -r requirements/neuron.txt
+VLLM_TARGET_DEVICE="neuron" pip install -e .
 ```
 
-If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
+Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested.
 
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:set-up-using-docker]
@@ -148,5 +98,57 @@ Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dock
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 
-There is no extra information for this device.
+[](){ #feature-support-through-nxd-inference-backend }
+### Feature support through NxD Inference backend
+
+The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend 
+    to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most 
+    [features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration. 
+
+To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override 
+as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include
+```console
+override_neuron_config={
+    "enable_bucketing":False,
+}
+```
+or when launching vLLM from the CLI, pass
+```console
+--override-neuron-config "{\"enable_bucketing\":false}"
+```
+
+Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts 
+(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. 
+
+### Known limitations
+
+- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this
+    [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility)
+    for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI.
+- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this 
+    [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) 
+    to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM.
+- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at 
+    runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py)
+- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed
+    to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature.
+- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer
+    to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node)
+    to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main.
+- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches 
+    max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt 
+    to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support 
+    for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is 
+    implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic.
+
+
+### Environment variables
+- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid 
+    compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
+    artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
+    but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts
+    under this specified path.
+- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
+- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
+
 # --8<-- [end:extra-information]
diff --git a/vllm/config.py b/vllm/config.py
index 3172cbe45..085060535 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -380,7 +380,7 @@ class ModelConfig:
     """Initialize non-default neuron config or override default neuron config
     that are specific to Neuron devices, this argument will be used to
     configure the neuron config that can not be gathered from the vllm
-    arguments. e.g. `{"cast_logits_dtype": "bloat16"}`."""
+    arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`."""
     pooler_config: Optional["PoolerConfig"] = field(init=False)
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
-- 
GitLab


From 3c49dbdd03f33fb938bc67230dbc2e8f536ed490 Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Thu, 29 May 2025 11:12:30 +0800
Subject: [PATCH 036/274] Skip device and quant Pydantic validation to make
 plugin device work (#18843)

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
---
 vllm/config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 085060535..6cec97a5f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -304,7 +304,7 @@ class ModelConfig:
     - 25.6k -> 25,600"""
     spec_target_max_model_len: Optional[int] = None
     """Specify the maximum length for spec decoding draft models."""
-    quantization: Optional[QuantizationMethods] = None
+    quantization: SkipValidation[Optional[QuantizationMethods]] = None
     """Method used to quantize the weights. If `None`, we first check the
     `quantization_config` attribute in the model config file. If that is
     `None`, we assume the model weights are not quantized and use `dtype` to
@@ -2231,7 +2231,7 @@ Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"]
 class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""
 
-    device: Union[Device, torch.device] = "auto"
+    device: SkipValidation[Union[Device, torch.device]] = "auto"
     """Device type for vLLM execution.
     This parameter is deprecated and will be 
     removed in a future release. 
-- 
GitLab


From fd7bb88d72ba721d6eb4f9d34198ad930c36c177 Mon Sep 17 00:00:00 2001
From: Brent Salisbury <bsalisbu@redhat.com>
Date: Thu, 29 May 2025 00:41:39 -0400
Subject: [PATCH 037/274] Fixes a dead link in nightly benchmark readme
 (#18856)

Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
---
 .buildkite/nightly-benchmarks/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index d3f5fc5cd..72c52d5bb 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
 
 ### Visualizing the results
 
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
-- 
GitLab


From 972eddf7c9cee072bb1f92618072782a4ed0b7c0 Mon Sep 17 00:00:00 2001
From: Satyajith Chilappagari <satchill@amazon.com>
Date: Thu, 29 May 2025 01:41:22 -0700
Subject: [PATCH 038/274] [Neuron] Add multi-LoRA support for Neuron. (#18284)

Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
---
 tests/neuron/2_core/test_multi_lora.py        |  98 +++++++++++
 .../model_loader/neuronx_distributed.py       |  31 ++--
 vllm/platforms/neuron.py                      |   3 -
 vllm/worker/neuron_model_runner.py            |  32 +++-
 vllm/worker/neuron_worker.py                  |  42 ++++-
 .../neuronx_distributed_model_runner.py       | 163 +++++++++++++++++-
 6 files changed, 343 insertions(+), 26 deletions(-)
 create mode 100644 tests/neuron/2_core/test_multi_lora.py

diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py
new file mode 100644
index 000000000..6fa8f9128
--- /dev/null
+++ b/tests/neuron/2_core/test_multi_lora.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from huggingface_hub import snapshot_download
+
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+
+
+def test_llama_single_lora():
+    sql_lora_files = snapshot_download(
+        repo_id="yard1/llama-2-7b-sql-lora-test")
+    llm = LLM(model="meta-llama/Llama-2-7b-hf",
+              tensor_parallel_size=2,
+              max_num_seqs=4,
+              max_model_len=512,
+              use_v2_block_manager=True,
+              override_neuron_config={
+                  "sequence_parallel_enabled": False,
+                  "skip_warmup": True,
+                  "lora_modules": [{
+                      "name": "lora_id_1",
+                      "path": sql_lora_files
+                  }]
+              },
+              enable_lora=True,
+              max_loras=1,
+              max_lora_rank=256,
+              device="neuron")
+    """For multi-lora requests using NxDI as the backend, only the lora_name 
+    needs to be specified. The lora_id and lora_path are supplied at the LLM 
+    class/server initialization, after which the paths are handled by NxDI"""
+    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
+    prompts = [
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+    outputs = llm.generate(prompts,
+                           SamplingParams(top_k=1),
+                           lora_request=[lora_req_1, lora_req_1])
+
+    expected_outputs = [
+        " the head of state and head of government of the United States. "
+        "The president direct",
+        " a city of contrasts. The city is home to the Eiffel Tower"
+    ]
+
+    for expected_output, output in zip(expected_outputs, outputs):
+        generated_text = output.outputs[0].text
+        assert (expected_output == generated_text)
+
+
+def test_llama_multiple_lora():
+    sql_lora_files = snapshot_download(
+        repo_id="yard1/llama-2-7b-sql-lora-test")
+    llm = LLM(model="meta-llama/Llama-2-7b-hf",
+              tensor_parallel_size=2,
+              max_num_seqs=4,
+              max_model_len=512,
+              use_v2_block_manager=True,
+              override_neuron_config={
+                  "sequence_parallel_enabled":
+                  False,
+                  "skip_warmup":
+                  True,
+                  "lora_modules": [{
+                      "name": "lora_id_1",
+                      "path": sql_lora_files
+                  }, {
+                      "name": "lora_id_2",
+                      "path": sql_lora_files
+                  }]
+              },
+              enable_lora=True,
+              max_loras=2,
+              max_lora_rank=256,
+              device="neuron")
+    """For multi-lora requests using NxDI as the backend, only the lora_name 
+    needs to be specified. The lora_id and lora_path are supplied at the LLM 
+    class/server initialization, after which the paths are handled by NxDI"""
+    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
+    lora_req_2 = LoRARequest("lora_id_2", 1, " ")
+    prompts = [
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+    outputs = llm.generate(prompts,
+                           SamplingParams(top_k=1),
+                           lora_request=[lora_req_1, lora_req_2])
+
+    expected_outputs = [
+        " the head of state and head of government of the United States. "
+        "The president direct",
+        " a city of contrasts. The city is home to the Eiffel Tower"
+    ]
+
+    for expected_output, output in zip(expected_outputs, outputs):
+        generated_text = output.outputs[0].text
+        assert (expected_output == generated_text)
diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
index 557feea46..624bd476c 100644
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -17,6 +17,8 @@ from neuronx_distributed_inference.models.config import (
     FusedSpecNeuronConfig, OnDeviceSamplingConfig)
 from neuronx_distributed_inference.models.mllama.utils import (
     create_vision_mask)
+from neuronx_distributed_inference.modules.lora_serving import (
+    LoraServingConfig)
 from neuronx_distributed_inference.utils.hf_adapter import (
     load_pretrained_config)
 from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
@@ -80,25 +82,26 @@ class NeuronCausalLM(nn.Module):
         # Lazy initialized
         self.model: nn.Module
 
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_block_ids: torch.Tensor,
-        sampling_params: torch.Tensor,
-    ) -> torch.Tensor:
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                input_block_ids: torch.Tensor,
+                sampling_params: torch.Tensor,
+                prev_hidden: Optional[torch.Tensor] = None,
+                adapter_ids: Optional[torch.Tensor] = None) -> torch.Tensor:
         # sort block ids sequentially for perf/neuron support reasons
         sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids)
         input_ids = torch.index_select(input_ids, 0, sorted_indices)
         positions = torch.index_select(positions, 0, sorted_indices)
         sampling_params = torch.index_select(sampling_params, 0,
                                              sorted_indices)
-
         output = self.model(input_ids,
                             attention_mask=None,
                             position_ids=positions,
                             seq_ids=sorted_input_block_ids,
-                            sampling_params=sampling_params)
+                            sampling_params=sampling_params,
+                            prev_hidden=prev_hidden,
+                            adapter_ids=adapter_ids)
         # on-device sampling
         if self.config.neuron_config.on_device_sampling_config:
             output = output.hidden_states
@@ -522,7 +525,8 @@ def _get_model_architecture(config: PretrainedConfig) -> str:
 
 def _get_default_neuron_config(model_config: ModelConfig,
                                parallel_config: ParallelConfig,
-                               scheduler_config: SchedulerConfig):
+                               scheduler_config: SchedulerConfig,
+                               lora_serving_config: LoraServingConfig):
     """Generate a neuron config based on vllm config args."""
     on_device_sampling_config = OnDeviceSamplingConfig(dynamic=True,
                                                        deterministic=False)
@@ -541,7 +545,7 @@ def _get_default_neuron_config(model_config: ModelConfig,
         padding_side="right",
         on_device_sampling_config=on_device_sampling_config,
         sequence_parallel_enabled=True,
-    )
+        lora_serving_config=lora_serving_config)
     return neuron_config
 
 
@@ -581,7 +585,8 @@ def _get_neuron_config_after_override(default_neuron_config,
 
 def get_neuron_model(model_config: ModelConfig,
                      parallel_config: ParallelConfig,
-                     scheduler_config: SchedulerConfig) -> nn.Module:
+                     scheduler_config: SchedulerConfig,
+                     lora_serving_config: LoraServingConfig) -> nn.Module:
     """Initializes a neuron-optimized model for inference."""
     model_arch = _get_model_architecture(model_config.hf_config)
     if model_arch == "MllamaForConditionalGeneration":
@@ -589,7 +594,7 @@ def get_neuron_model(model_config: ModelConfig,
     else:
         model = NeuronCausalLM(model_config.hf_config)
     default_neuron_config_args = _get_default_neuron_config(
-        model_config, parallel_config, scheduler_config)
+        model_config, parallel_config, scheduler_config, lora_serving_config)
     neuron_config = _get_neuron_config_after_override(
         default_neuron_config_args, model_config.override_neuron_config)
 
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 474c70d04..56f204e71 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -49,9 +49,6 @@ class NeuronPlatform(Platform):
         if parallel_config.world_size > 1:
             parallel_config.distributed_executor_backend = "uni"
 
-        assert (vllm_config.lora_config
-                is None), "LoRA is not supported for Neuron backend."
-
         if vllm_config.cache_config and vllm_config.model_config:
             # neuron needs block_size = max_model_len
             vllm_config.cache_config.block_size = \
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 968596471..292fe57f3 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -2,13 +2,15 @@
 
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
 
 import torch
 from torch import nn
 
 from vllm.config import DeviceConfig, VllmConfig
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.neuron import get_neuron_model
@@ -36,6 +38,7 @@ class ModelInputForNeuron(ModelRunnerInputBase):
     input_block_ids: Optional[torch.Tensor] = None
     sampling_metadata: SamplingMetadata = None
     multi_modal_kwargs: BatchedTensorInputs = None
+    adapter_ids: Optional[str] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -80,6 +83,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                            "The model will run without sliding window.")
         self.device_config = (self.device_config if self.device_config
                               is not None else DeviceConfig())
+        self.lora_config = vllm_config.lora_config
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
 
@@ -378,6 +382,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                 positions=model_input.input_positions,
                 input_block_ids=model_input.input_block_ids,
                 sampling_params=sampling_params,
+                adapter_ids=model_input.adapter_ids,
                 **MultiModalKwargs.as_kwargs(
                     model_input.multi_modal_kwargs or {},
                     dtype=self.model_config.dtype,
@@ -416,3 +421,28 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
     @property
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()
+
+    def remove_all_loras(self):
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def add_lora(self, lora_request: LoRARequest):
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index aa8e39613..64daee31b 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """A Neuron worker class."""
 import os
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch.distributed
 
@@ -9,19 +9,19 @@ from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.platforms.neuron import NeuronFramework
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
-from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoRANotSupportedWorkerBase, WorkerBase,
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
                                      WorkerInput)
 
 logger = init_logger(__name__)
 
 
-class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
+class NeuronWorker(LocalOrDistributedWorkerBase):
     """A worker class that executes the model on a group of neuron cores.
     """
 
@@ -38,6 +38,7 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.is_driver_worker = is_driver_worker
+        self.lora_config = vllm_config.lora_config
 
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
@@ -59,6 +60,9 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
                 "[transformers-neuronx, neuronx-distributed-inference]")
 
     def get_tnx_model_runner(self, vllm_config):
+        assert (self.lora_config
+                is None), ("LoRA is not supported for TransformersNeuronX "
+                           "framework.")
         from vllm.worker.multi_step_neuron_model_runner import (
             MultiStepNeuronModelRunner)
         if self.speculative_config is not None:
@@ -72,6 +76,8 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         from vllm.worker.neuronx_distributed_model_runner import (
             NeuronxDistributedModelRunner)
         if self.speculative_config is not None:
+            assert (self.lora_config
+                    is None), "LoRA is not supported for Speculative Decoding"
             return MultiStepNeuronxDistributedModelRunner(
                 vllm_config=vllm_config)
         else:
@@ -156,3 +162,31 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
             1,
             1,
         )
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.list_loras()
diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py
index 4e784e5e0..aa94706c8 100644
--- a/vllm/worker/neuronx_distributed_model_runner.py
+++ b/vllm/worker/neuronx_distributed_model_runner.py
@@ -1,17 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import List, Optional
+from typing import List, Optional, Set
 
 import torch
 from neuronx_distributed_inference.modules.generation.sampling import (
     prepare_sampling_params)
+from neuronx_distributed_inference.modules.lora_serving import (
+    LoraCheckpoint, LoraServingConfig)
 
 from vllm.config import VllmConfig
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.neuronx_distributed import (
     _get_model_architecture, get_neuron_model)
-from vllm.sequence import IntermediateTensors
+from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.worker.neuron_model_runner import (ModelInputForNeuron,
                                              NeuronModelRunner)
 
@@ -25,11 +32,44 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
         vllm_config: VllmConfig,
     ):
         super().__init__(vllm_config)
+        self.lora_checkpoint = None
+        self.model = None
+        self.lora_serving_config = None
+
+    @staticmethod
+    def _get_lora_paths_strings(lora_modules: List[LoRAModulePath]):
+        if not lora_modules:
+            return None
+        return {_.get("name"): _.get("path") for _ in lora_modules}
+
+    def _get_nxdi_lora_config(self):
+        override_neuron_config = self.model_config.override_neuron_config
+        lora_modules = override_neuron_config.pop("lora_modules", None)
+        target_modules = override_neuron_config.pop("target_modules", None)
+        lora_ckpt_paths = self._get_lora_paths_strings(lora_modules)
+        if self.lora_config.max_loras < len(lora_ckpt_paths):
+            raise ValueError(
+                "Number of LoRAs (%s) exceeds maximum "
+                "allowed (%s)", len(lora_ckpt_paths),
+                self.lora_config.max_loras)
+
+        return LoraServingConfig(
+            max_loras=self.lora_config.max_loras,
+            max_lora_rank=self.lora_config.max_lora_rank,
+            target_modules=target_modules,
+            lora_ckpt_paths=lora_ckpt_paths,
+        )
 
     def load_model(self) -> None:
-        self.model = get_neuron_model(self.model_config,
-                                      parallel_config=self.parallel_config,
-                                      scheduler_config=self.scheduler_config)
+        # Update LoRA config
+        if self.lora_config is not None:
+            self.lora_serving_config = self._get_nxdi_lora_config()
+            self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config)
+        self.model = get_neuron_model(
+            self.model_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config,
+            lora_serving_config=self.lora_serving_config)
 
     def get_nxd_sampling_params(self, sampling_metadata):
         if self.model.config.neuron_config.on_device_sampling_config:
@@ -134,3 +174,116 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
         )
 
         return [output]
+
+    def _get_lora_adapter_ids(self, seq_group_metadata_list):
+        # set LoRA adapter IDs for multi-lora serving
+        batch_size = len(seq_group_metadata_list)
+        if self.lora_checkpoint is not None:
+            # "0" indicates NxDI to use the base model for inference
+            adapter_ids = ["0"] * batch_size
+            for idx, seq_group_metadata in enumerate(seq_group_metadata_list):
+                if seq_group_metadata.lora_request is not None:
+                    adapter_ids[
+                        idx] = seq_group_metadata.lora_request.lora_name
+
+            # convert adapter_ids from strings to integers
+            adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices(
+                adapter_ids, batch_size)
+        else:
+            adapter_ids = torch.zeros((batch_size), dtype=torch.int32)
+
+        return adapter_ids
+
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForNeuron:
+        multi_modal_kwargs = None
+        # NOTE: We assume that all sequences in the group are all prompts or
+        # all decodes.
+        is_prompt = seq_group_metadata_list[0].is_prompt
+        # Prepare input tensors.
+        if is_prompt:
+            (input_tokens, input_positions, input_block_ids, seq_lens,
+             multi_modal_kwargs
+             ) = self._prepare_prompt(seq_group_metadata_list)
+        else:
+            (input_tokens, input_positions,
+             input_block_ids) = self._prepare_decode(seq_group_metadata_list)
+            seq_lens = None
+
+        if not self._on_device_sampling_disabled:
+            for seq_group_metadata in seq_group_metadata_list:
+                sampling_params = seq_group_metadata.sampling_params
+                top_k, top_p, temperature = (
+                    self._convert_to_neuron_sampling_params(sampling_params))
+                sampling_params.top_k = top_k
+                sampling_params.top_p = top_p
+                sampling_params.temperature = temperature
+
+        lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list)
+
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens,
+            # query_lens is not needed if chunked prefill is not
+            # supported. Since neuron worker doesn't support chunked prefill
+            # just use seq_lens instead.
+            seq_lens,
+            self.device,
+            self.pin_memory,
+            generators=self.get_generators(finished_requests_ids))
+
+        if current_platform.use_transformers_neuronx(
+        ) and not self._on_device_sampling_disabled:
+            # Once the request IDs are changed in current iteration, we will
+            # update the on-device sampling parameters.
+            current_batch_request_ids = [
+                seq_group_meta_data.request_id
+                for seq_group_meta_data in seq_group_metadata_list
+            ]
+            if current_batch_request_ids != self._previous_batch_request_ids:
+                self._update_neuron_sampling_params(seq_group_metadata_list)
+                self._previous_batch_request_ids = current_batch_request_ids
+
+        return ModelInputForNeuron(input_tokens=input_tokens,
+                                   input_positions=input_positions,
+                                   input_block_ids=input_block_ids,
+                                   sampling_metadata=sampling_metadata,
+                                   multi_modal_kwargs=multi_modal_kwargs,
+                                   adapter_ids=lora_adapter_ids)
+
+    def remove_all_loras(self):
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def add_lora(self, lora_request: LoRARequest):
+        logger.warning(
+            "Adding LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config. If you supplied "
+            "the parameter, you can ignore this warning. Ignoring"
+            "lora request: ", lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
-- 
GitLab


From 34d6c447c4b9cf2dd986df7788684cc40662e5e7 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 29 May 2025 16:46:24 +0800
Subject: [PATCH 039/274] [LoRA] Add LoRA support for InternVL  (#18842)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/internvl.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 4612fc438..71be2b48d 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.models.intern_vit import (InternVisionModel,
                                                    InternVisionPatchModel)
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
@@ -36,7 +37,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsPP)
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
@@ -1014,7 +1016,17 @@ class InternVLMultiModalProcessor(
     InternVLMultiModalProcessor,
     info=InternVLProcessingInfo,
     dummy_inputs=InternVLDummyInputsBuilder)
-class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
+class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
+                        SupportsLoRA):
+
+    packed_modules_mapping = {
+        "wqkv": ["wqkv"],
+        "qkv": ["qkv"],
+        "gate_up_proj": [
+            "w1",
+            "w3",
+        ],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
@@ -1403,3 +1415,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         ]
         loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
         return loader.load_weights(weights)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="mlp1",
+            tower_model="vision_model")
-- 
GitLab


From a652e71dd0ed2cfd951205bd836c069aeec34380 Mon Sep 17 00:00:00 2001
From: Michael Yao <haifeng.yao@daocloud.io>
Date: Thu, 29 May 2025 17:51:20 +0800
Subject: [PATCH 040/274] [Doc] Remove redundant spaces from
 compatibility_matrix.md (#18891)

Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
---
 docs/features/compatibility_matrix.md | 35 ++++++++++++++-------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md
index 71882d317..5d448eb5c 100644
--- a/docs/features/compatibility_matrix.md
+++ b/docs/features/compatibility_matrix.md
@@ -10,6 +10,7 @@ The symbols used have the following meanings:
 - ✅ = Full compatibility
 - 🟠 = Partial compatibility
 - ❌ = No compatibility
+- ❔ = Unknown or TBD
 
 !!! note
     Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination.
@@ -36,23 +37,23 @@ th:not(:first-child) {
 }
 </style>
 
-| Feature                                                   | [CP][chunked-prefill]   | [APC][automatic-prefix-caching]   | [LoRA][lora-adapter]   | <abbr title="Prompt Adapter">prmpt adptr</abbr>   | [SD][spec-decode]   | CUDA graph   | <abbr title="Pooling Models">pooling</abbr>   | <abbr title="Encoder-Decoder Models">enc-dec</abbr>   | <abbr title="Logprobs">logP</abbr>   | <abbr title="Prompt Logprobs">prmpt logP</abbr>   | <abbr title="Async Output Processing">async output</abbr>   | multi-step         | <abbr title="Multimodal Inputs">mm</abbr>   | best-of   | beam-search   |
-|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------|
-| [CP][chunked-prefill]                                     | ✅                       |                                   |                        |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| [APC][automatic-prefix-caching]                           | ✅                       | ✅                                 |                        |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| [LoRA][lora-adapter]                                      | ✅                       | ✅                                 | ✅                      |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Prompt Adapter">prmpt adptr</abbr>           | ✅                       | ✅                                 | ✅                      | ✅                                                 |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| [SD][spec-decode]                                         | ✅                       | ✅                                 | ❌                      | ✅                                                 | ✅                   |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| CUDA graph                                                | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Pooling Models">pooling</abbr>               | ❌                       | ❌                                 | ❌                      | ❌                                                 | ❌                   | ❌            | ✅                                             |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ❌                       | [❌](gh-issue:7366)                | ❌                      | ❌                                                 | [❌](gh-issue:7366)  | ✅            | ✅                                             | ✅                                                     |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Logprobs">logP</abbr>                        | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            | ❌                                             | ✅                                                     | ✅                                    |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 |                                                             |                    |                                             |           |               |
-| <abbr title="Async Output Processing">async output</abbr> | ✅                       | ✅                                 | ✅                      | ✅                                                 | ❌                   | ✅            | ❌                                             | ❌                                                     | ✅                                    | ✅                                                 | ✅                                                           |                    |                                             |           |               |
-| multi-step                                                | ❌                       | ✅                                 | ❌                      | ✅                                                 | ❌                   | ✅            | ❌                                             | ❌                                                     | ✅                                    | ✅                                                 | ✅                                                           | ✅                  |                                             |           |               |
-| <abbr title="Multimodal Inputs">mm</abbr>                 | ✅                       | [🟠](gh-pr:8348)                   | [🟠](gh-pr:4194)        | ❔                                                 | ❔                   | ✅            | ✅                                             | ✅                                                     | ✅                                    | ✅                                                 | ✅                                                           | ❔                  | ✅                                           |           |               |
-| best-of                                                   | ✅                       | ✅                                 | ✅                      | ✅                                                 | [❌](gh-issue:6137)  | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 | ❔                                                           | [❌](gh-issue:7968) | ✅                                           | ✅         |               |
-| beam-search                                               | ✅                       | ✅                                 | ✅                      | ✅                                                 | [❌](gh-issue:6137)  | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 | ❔                                                           | [❌](gh-issue:7968) | ❔                                           | ✅         | ✅             |
+| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
+| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
+| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
+| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
+| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
+| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
+| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
+| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
+| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
+| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
+| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
+| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
+| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
+| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
+| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
 
 [](){ #feature-x-hardware }
 
-- 
GitLab


From e740d07f07d82983217077b89e23beaae134a30b Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Thu, 29 May 2025 17:51:36 +0800
Subject: [PATCH 041/274] [doc] add CLI doc (#18871)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/.nav.yml      |   3 +
 docs/cli/README.md | 179 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 docs/cli/README.md

diff --git a/docs/.nav.yml b/docs/.nav.yml
index 42aba9775..a9c594c29 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -12,6 +12,7 @@ nav:
       - User Guide: usage/README.md
       - Developer Guide: contributing/README.md
       - API Reference: api/README.md
+      - CLI Reference: cli/README.md
     - Timeline:
       - Roadmap: https://roadmap.vllm.ai
       - Releases: https://github.com/vllm-project/vllm/releases
@@ -56,6 +57,8 @@ nav:
     - Contents:
       - glob: api/vllm/*
         preserve_directory_names: true
+  - CLI Reference:
+    - Summary: cli/README.md
   - Community:
     - community/*
     - Blog: https://blog.vllm.ai
diff --git a/docs/cli/README.md b/docs/cli/README.md
new file mode 100644
index 000000000..5feb316d6
--- /dev/null
+++ b/docs/cli/README.md
@@ -0,0 +1,179 @@
+# vLLM CLI Guide
+
+The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
+
+```
+vllm --help
+```
+
+Available Commands:
+
+```
+vllm {chat,complete,serve,bench,collect-env,run-batch}
+```
+
+## Table of Contents
+
+- [serve](#serve)
+- [chat](#chat)
+- [complete](#complete)
+- [bench](#bench)
+  - [latency](#latency)
+  - [serve](#serve-1)
+  - [throughput](#throughput)
+- [collect-env](#collect-env)
+- [run-batch](#run-batch)
+- [More Help](#more-help)
+
+## serve
+
+Start the vLLM OpenAI Compatible API server.
+
+Examples:
+
+```bash
+# Start with a model
+vllm serve meta-llama/Llama-2-7b-hf
+
+# Specify the port
+vllm serve meta-llama/Llama-2-7b-hf --port 8100
+
+# Check with --help for more options
+# To list all groups
+vllm serve --help=listgroup
+
+# To view a argument group
+vllm serve --help=ModelConfig
+
+# To view a single argument
+vllm serve --help=max-num-seqs
+
+# To search by keyword
+vllm serve --help=max
+```
+
+## chat
+
+Generate chat completions via the running API server.
+
+Examples:
+
+```bash
+# Directly connect to localhost API without arguments
+vllm chat
+
+# Specify API url
+vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
+
+# Quick chat with a single prompt
+vllm chat --quick "hi"
+```
+
+## complete
+
+Generate text completions based on the given prompt via the running API server.
+
+Examples:
+
+```bash
+# Directly connect to localhost API without arguments
+vllm complete
+
+# Specify API url
+vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
+
+# Quick complete with a single prompt
+vllm complete --quick "The future of AI is"
+```
+
+## bench
+
+Run benchmark tests for latency online serving throughput and offline inference throughput.
+
+Available Commands:
+
+```bash
+vllm bench {latency, serve, throughput}
+```
+
+### latency
+
+Benchmark the latency of a single batch of requests.
+
+Example:
+
+```bash
+vllm bench latency \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --input-len 32 \
+    --output-len 1 \
+    --enforce-eager \
+    --load-format dummy
+```
+
+### serve
+
+Benchmark the online serving throughput.
+
+Example:
+
+```bash
+vllm bench serve \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --host server-host \
+    --port server-port \
+    --random-input-len 32 \
+    --random-output-len 4  \
+    --num-prompts  5
+```
+
+### throughput
+
+Benchmark offline inference throughput.
+
+Example:
+
+```bash
+vllm bench throughput \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --input-len 32 \
+    --output-len 1 \
+    --enforce-eager \
+    --load-format dummy
+```
+
+## collect-env
+
+Start collecting environment information.
+
+```bash
+vllm collect-env
+```
+
+## run-batch
+
+Run batch prompts and write results to file.
+
+Examples:
+
+```bash
+# Running with a local file
+vllm run-batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+
+# Using remote file
+vllm run-batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+## More Help
+
+For detailed options of any subcommand, use:
+
+```bash
+vllm <subcommand> --help
+```
-- 
GitLab


From 7fcfd954ffbf947ea323abd364b2af8229aa0a93 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 29 May 2025 17:54:14 +0800
Subject: [PATCH 042/274] [Bugfix] Fix misleading information in the
 documentation (#18845)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 docs/models/supported_models.md | 114 ++++++++++++++++----------------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 7594c6e6f..0202ba5a6 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -302,31 +302,31 @@ Specified using `--task generate`.
 | Architecture                                      | Models                                              | Example HF Models                                                                                                                                                            | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
 |---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|
 | `AquilaForCausalLM`                               | Aquila, Aquila2                                     | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
-| `ArcticForCausalLM`                               | Arctic                                              | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.                                                                                               | ✅︎                     |                             |
+| `ArcticForCausalLM`                               | Arctic                                              | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.                                                                                               |                        | ✅︎                          |
 | `BaiChuanForCausalLM`                             | Baichuan2, Baichuan                                 | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.                                                                                                          | ✅︎                     | ✅︎                          |
-| `BambaForCausalLM`                                | Bamba                                               | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`                                                                                                                   |                        |                             |
-| `BloomForCausalLM`                                | BLOOM, BLOOMZ, BLOOMChat                            | `bigscience/bloom`, `bigscience/bloomz`, etc.                                                                                                                                | ✅︎                     |                             |
+| `BambaForCausalLM`                                | Bamba                                               | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`                                                                                                                   | ✅︎                     | ✅︎                          |
+| `BloomForCausalLM`                                | BLOOM, BLOOMZ, BLOOMChat                            | `bigscience/bloom`, `bigscience/bloomz`, etc.                                                                                                                                |                        | ✅︎                          |
 | `BartForConditionalGeneration`                    | BART                                                | `facebook/bart-base`, `facebook/bart-large-cnn`, etc.                                                                                                                        |                        |                             |
 | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM                                             | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc.                                                                                                       | ✅︎                     | ✅︎                          |
 | `CohereForCausalLM`, `Cohere2ForCausalLM`         | Command-R                                           | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.                                                                                               | ✅︎                     | ✅︎                          |
-| `DbrxForCausalLM`                                 | DBRX                                                | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.                                                                                                                     | ✅︎                     |                             |
-| `DeciLMForCausalLM`                               | DeciLM                                              | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.                                                                                                                               | ✅︎                     |                             |
-| `DeepseekForCausalLM`                             | DeepSeek                                            | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.                                                                                                 | ✅︎                     |                             |
-| `DeepseekV2ForCausalLM`                           | DeepSeek-V2                                         | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.                                                                                                               | ✅︎                     |                             |
-| `DeepseekV3ForCausalLM`                           | DeepSeek-V3                                         | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.                                                                                                               | ✅︎                     |                             |
+| `DbrxForCausalLM`                                 | DBRX                                                | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.                                                                                                                     |                        | ✅︎                          |
+| `DeciLMForCausalLM`                               | DeciLM                                              | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.                                                                                                                               | ✅︎                     | ✅︎                          |
+| `DeepseekForCausalLM`                             | DeepSeek                                            | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.                                                                                                 |                        | ✅︎                          |
+| `DeepseekV2ForCausalLM`                           | DeepSeek-V2                                         | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.                                                                                                               |                        | ✅︎                          |
+| `DeepseekV3ForCausalLM`                           | DeepSeek-V3                                         | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.                                                                                                               |                        | ✅︎                          |
 | `ExaoneForCausalLM`                               | EXAONE-3                                            | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
-| `FalconForCausalLM`                               | Falcon                                              | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.                                                                                                         | ✅︎                     |                             |
-| `FalconMambaForCausalLM`                          | FalconMamba                                         | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.                                                                                                            | ✅︎                     | ✅︎                          |
+| `FalconForCausalLM`                               | Falcon                                              | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.                                                                                                         |                        | ✅︎                          |
+| `FalconMambaForCausalLM`                          | FalconMamba                                         | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.                                                                                                            |                        | ✅︎                          |
 | `FalconH1ForCausalLM`                             | Falcon-H1                                           | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc.                                                                                                           | ✅︎                     | ✅︎                          |
 | `GemmaForCausalLM`                                | Gemma                                               | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc.                                                                                                                            | ✅︎                     | ✅︎                          |
 | `Gemma2ForCausalLM`                               | Gemma 2                                             | `google/gemma-2-9b`, `google/gemma-2-27b`, etc.                                                                                                                              | ✅︎                     | ✅︎                          |
 | `Gemma3ForCausalLM`                               | Gemma 3                                             | `google/gemma-3-1b-it`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          |
 | `GlmForCausalLM`                                  | GLM-4                                               | `THUDM/glm-4-9b-chat-hf`, etc.                                                                                                                                               | ✅︎                     | ✅︎                          |
 | `Glm4ForCausalLM`                                 | GLM-4-0414                                          | `THUDM/GLM-4-32B-0414`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          |
-| `GPT2LMHeadModel`                                 | GPT-2                                               | `gpt2`, `gpt2-xl`, etc.                                                                                                                                                      | ✅︎                     |                             |
+| `GPT2LMHeadModel`                                 | GPT-2                                               | `gpt2`, `gpt2-xl`, etc.                                                                                                                                                      |                        | ✅︎                          |
 | `GPTBigCodeForCausalLM`                           | StarCoder, SantaCoder, WizardCoder                  | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.                                                                                 | ✅︎                     | ✅︎                          |
-| `GPTJForCausalLM`                                 | GPT-J                                               | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.                                                                                                                            | ✅︎                     |                             |
-| `GPTNeoXForCausalLM`                              | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | ✅︎                     |                             |
+| `GPTJForCausalLM`                                 | GPT-J                                               | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.                                                                                                                            |                        | ✅︎                          |
+| `GPTNeoXForCausalLM`                              | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. |                        | ✅︎                          |
 | `GraniteForCausalLM`                              | Granite 3.0, Granite 3.1, PowerLM                   | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.                                                                             | ✅︎                     | ✅︎                          |
 | `GraniteMoeForCausalLM`                           | Granite 3.0 MoE, PowerMoE                           | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.                                                                | ✅︎                     | ✅︎                          |
 | `GraniteMoeHybridForCausalLM`                     | Granite 4.0 MoE Hybrid                              | `ibm-granite/granite-4.0-tiny-preview`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
@@ -336,39 +336,39 @@ Specified using `--task generate`.
 | `InternLMForCausalLM`                             | InternLM                                            | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.                                                                                                                    | ✅︎                     | ✅︎                          |
 | `InternLM2ForCausalLM`                            | InternLM2                                           | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.                                                                                                                  | ✅︎                     | ✅︎                          |
 | `InternLM3ForCausalLM`                            | InternLM3                                           | `internlm/internlm3-8b-instruct`, etc.                                                                                                                                       | ✅︎                     | ✅︎                          |
-| `JAISLMHeadModel`                                 | Jais                                                | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.                                                         | ✅︎                     |                             |
+| `JAISLMHeadModel`                                 | Jais                                                | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.                                                         |                        | ✅︎                          |
 | `JambaForCausalLM`                                | Jamba                                               | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.                                                                                 | ✅︎                     | ✅︎                          |
 | `LlamaForCausalLM`                                | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi              | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.        | ✅︎                     | ✅︎                          |
-| `MambaForCausalLM`                                | Mamba                                               | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.                                                                               | ✅︎                     |                             |
+| `MambaForCausalLM`                                | Mamba                                               | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.                                                                               |                        | ✅︎                          |
 | `MiniCPMForCausalLM`                              | MiniCPM                                             | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.                                                                               | ✅︎                     | ✅︎                          |
 | `MiniCPM3ForCausalLM`                             | MiniCPM3                                            | `openbmb/MiniCPM3-4B`, etc.                                                                                                                                                  | ✅︎                     | ✅︎                          |
 | `MistralForCausalLM`                              | Mistral, Mistral-Instruct                           | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.                                                                                                      | ✅︎                     | ✅︎                          |
 | `MixtralForCausalLM`                              | Mixtral-8x7B, Mixtral-8x7B-Instruct                 | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.                                                          | ✅︎                     | ✅︎                          |
-| `MPTForCausalLM`                                  | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter        | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.                                                                                                   | ✅︎                     |                             |
+| `MPTForCausalLM`                                  | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter        | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.                                                                                                   |                        | ✅︎                          |
 | `NemotronForCausalLM`                             | Nemotron-3, Nemotron-4, Minitron                    | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.                                                                                                         | ✅︎                     | ✅︎                          |
-| `OLMoForCausalLM`                                 | OLMo                                                | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.                                                                                                                             | ✅︎                     |                             |
-| `OLMo2ForCausalLM`                                | OLMo2                                               | `allenai/OLMo-2-0425-1B`, etc.                                                                                                                                               | ✅︎                     |                             |
-| `OLMoEForCausalLM`                                | OLMoE                                               | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.                                                                                                        | ✅︎                     | ✅︎                          |
-| `OPTForCausalLM`                                  | OPT, OPT-IML                                        | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.                                                                                                                         | ✅︎                     |                             |
-| `OrionForCausalLM`                                | Orion                                               | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.                                                                                                             | ✅︎                     |                             |
+| `OLMoForCausalLM`                                 | OLMo                                                | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.                                                                                                                             |                        | ✅︎                          |
+| `OLMo2ForCausalLM`                                | OLMo2                                               | `allenai/OLMo-2-0425-1B`, etc.                                                                                                                                               |                        | ✅︎                          |
+| `OLMoEForCausalLM`                                | OLMoE                                               | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.                                                                                                        |                        | ✅︎                          |
+| `OPTForCausalLM`                                  | OPT, OPT-IML                                        | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.                                                                                                                         |                        | ✅︎                          |
+| `OrionForCausalLM`                                | Orion                                               | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.                                                                                                             |                        | ✅︎                          |
 | `PhiForCausalLM`                                  | Phi                                                 | `microsoft/phi-1_5`, `microsoft/phi-2`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
 | `Phi3ForCausalLM`                                 | Phi-4, Phi-3                                        | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.   | ✅︎                     | ✅︎                          |
-| `Phi3SmallForCausalLM`                            | Phi-3-Small                                         | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.                                                                                             | ✅︎                     |                             |
+| `Phi3SmallForCausalLM`                            | Phi-3-Small                                         | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.                                                                                             |                        | ✅︎                          |
 | `PhiMoEForCausalLM`                               | Phi-3.5-MoE                                         | `microsoft/Phi-3.5-MoE-instruct`, etc.                                                                                                                                       | ✅︎                     | ✅︎                          |
-| `PersimmonForCausalLM`                            | Persimmon                                           | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.                                                                                                                   | ✅︎                     |                             |
+| `PersimmonForCausalLM`                            | Persimmon                                           | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.                                                                                                                   |                        | ✅︎                          |
 | `Plamo2ForCausalLM`                               | PLaMo2                                              | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc.                                                                                                                                 |                        |                             |
 | `QWenLMHeadModel`                                 | Qwen                                                | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.                                                                                                                                    | ✅︎                     | ✅︎                          |
 | `Qwen2ForCausalLM`                                | QwQ, Qwen2                                          | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.                                                                                                      | ✅︎                     | ✅︎                          |
-| `Qwen2MoeForCausalLM`                             | Qwen2MoE                                            | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.                                                                                                                | ✅︎                     |                             |
+| `Qwen2MoeForCausalLM`                             | Qwen2MoE                                            | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.                                                                                                                |                        | ✅︎                          |
 | `Qwen3ForCausalLM`                                | Qwen3                                               | `Qwen/Qwen3-8B`, etc.                                                                                                                                                        | ✅︎                     | ✅︎                          |
-| `Qwen3MoeForCausalLM`                             | Qwen3MoE                                            | `Qwen/Qwen3-30B-A3B`, etc.                                                                                                                                                   | ✅︎                     |                             |
-| `StableLmForCausalLM`                             | StableLM                                            | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.                                                                                                | ✅︎                     |                             |
-| `Starcoder2ForCausalLM`                           | Starcoder2                                          | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.                                                                                             | ✅︎                     |                             |
+| `Qwen3MoeForCausalLM`                             | Qwen3MoE                                            | `Qwen/Qwen3-30B-A3B`, etc.                                                                                                                                                   |                        | ✅︎                          |
+| `StableLmForCausalLM`                             | StableLM                                            | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.                                                                                                |                        |                             |
+| `Starcoder2ForCausalLM`                           | Starcoder2                                          | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.                                                                                             |                        | ✅︎                          |
 | `SolarForCausalLM`                                | Solar Pro                                           | `upstage/solar-pro-preview-instruct`, etc.                                                                                                                                   | ✅︎                     | ✅︎                          |
 | `TeleChat2ForCausalLM`                            | TeleChat2                                           | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.                                                                                                | ✅︎                     | ✅︎                          |
 | `TeleFLMForCausalLM`                              | TeleFLM                                             | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.                                                                                                                    | ✅︎                     | ✅︎                          |
 | `XverseForCausalLM`                               | XVERSE                                              | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.                                                                                            | ✅︎                     | ✅︎                          |
-| `MiniMaxText01ForCausalLM`                        | MiniMax-Text                                        | `MiniMaxAI/MiniMax-Text-01`, etc.                                                                                                                                            | ✅︎                     |                             |
+| `MiniMaxText01ForCausalLM`                        | MiniMax-Text                                        | `MiniMaxAI/MiniMax-Text-01`, etc.                                                                                                                                            |                        |                             |
 | `Zamba2ForCausalLM`                               | Zamba2                                              | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.                                                                              |                        |                             |
 
 !!! note
@@ -512,44 +512,44 @@ Specified using `--task generate`.
 
 | Architecture                                 | Models                                                                   | Inputs                                                                | Example HF Models                                                                                                                                       | [LoRA][lora-adapter]   | [PP][distributed-serving]   | [V1](gh-issue:8779)   |
 |----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
-| `AriaForConditionalGeneration`               | Aria                                                                     | T + I<sup>+</sup>                                                     | `rhymes-ai/Aria`                                                                                                                                        | ✅︎                     | ✅︎                          |                       |
-| `AyaVisionForConditionalGeneration`          | Aya Vision                                                               | T + I<sup>+</sup>                                                     | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.                                                                                         | ✅︎                     | ✅︎                          |                       |
-| `Blip2ForConditionalGeneration`              | BLIP-2                                                                   | T + I<sup>E</sup>                                                     | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.                                                                                          | ✅︎                     | ✅︎                          |                       |
-| `ChameleonForConditionalGeneration`          | Chameleon                                                                | T + I                                                                 | `facebook/chameleon-7b` etc.                                                                                                                            | ✅︎                     | ✅︎                          |                       |
-| `DeepseekVLV2ForCausalLM`<sup>^</sup>        | DeepSeek-VL2                                                             | T + I<sup>+</sup>                                                     | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.                                                      | ✅︎                     | ✅︎                          |                       |
+| `AriaForConditionalGeneration`               | Aria                                                                     | T + I<sup>+</sup>                                                     | `rhymes-ai/Aria`                                                                                                                                        |                        |                            | ✅︎                       |
+| `AyaVisionForConditionalGeneration`          | Aya Vision                                                               | T + I<sup>+</sup>                                                     | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.                                                                                         |                        | ✅︎                          | ✅︎                       |
+| `Blip2ForConditionalGeneration`              | BLIP-2                                                                   | T + I<sup>E</sup>                                                     | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.                                                                                          |                        | ✅︎                          | ✅︎                      |
+| `ChameleonForConditionalGeneration`          | Chameleon                                                                | T + I                                                                 | `facebook/chameleon-7b` etc.                                                                                                                            |                      | ✅︎                            | ✅︎                      |
+| `DeepseekVLV2ForCausalLM`<sup>^</sup>        | DeepSeek-VL2                                                             | T + I<sup>+</sup>                                                     | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.                                                      |                       | ✅︎                           | ✅︎                      |
 | `Florence2ForConditionalGeneration`          | Florence-2                                                               | T + I                                                                 | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc.                                                                                          |                        |                             |                       |
-| `FuyuForCausalLM`                            | Fuyu                                                                     | T + I                                                                 | `adept/fuyu-8b` etc.                                                                                                                                    | ✅︎                     | ✅︎                          |                       |
+| `FuyuForCausalLM`                            | Fuyu                                                                     | T + I                                                                 | `adept/fuyu-8b` etc.                                                                                                                                    |                       | ✅︎                           | ✅︎                     |
 | `Gemma3ForConditionalGeneration`             | Gemma 3                                                                  | T + I<sup>+</sup>                                                     | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.                                                                                                   | ✅︎                     | ✅︎                          | ⚠️                    |
 | `GLM4VForCausalLM`<sup>^</sup>               | GLM-4V                                                                   | T + I                                                                 | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc.                                                                                                    | ✅︎                     | ✅︎                          | ✅︎                    |
 | `GraniteSpeechForConditionalGeneration`      | Granite Speech                                                           | T + A                                                                 | `ibm-granite/granite-speech-3.3-8b`                                                                                                                     | ✅︎                     | ✅︎                          | ✅︎                    |
-| `H2OVLChatModel`                             | H2OVL                                                                    | T + I<sup>E+</sup>                                                    | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.                                                                                      | ✅︎                     | ✅︎\*                        |                       |
-| `Idefics3ForConditionalGeneration`           | Idefics3                                                                 | T + I                                                                 | `HuggingFaceM4/Idefics3-8B-Llama3` etc.                                                                                                                 | ✅︎                     | ✅︎                          |                       |
-| `InternVLChatModel`                          | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>)                                                    | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.  | ✅︎                     | ✅︎                          |                       |
-| `KimiVLForConditionalGeneration`             | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking                               | T + I<sup>+</sup>                                                     | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`                                                                                    | ✅︎                     |                             |                       |
-| `Llama4ForConditionalGeneration`             | Llama 4                                                                  | T + I<sup>+</sup>                                                     | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎                     | ✅︎                          |                       |
-| `LlavaForConditionalGeneration`              | LLaVA-1.5                                                                | T + I<sup>E+</sup>                                                    | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.                                                                        | ✅︎                     | ✅︎                          |                       |
-| `LlavaNextForConditionalGeneration`          | LLaVA-NeXT                                                               | T + I<sup>E+</sup>                                                    | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.                                                                           | ✅︎                     | ✅︎                          |                       |
-| `LlavaNextVideoForConditionalGeneration`     | LLaVA-NeXT-Video                                                         | T + V                                                                 | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.                                                                                                                 | ✅︎                     | ✅︎                          |                       |
-| `LlavaOnevisionForConditionalGeneration`     | LLaVA-Onevision                                                          | T + I<sup>+</sup> + V<sup>+</sup>                                     | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.                                                            | ✅︎                     | ✅︎                          |                       |
+| `H2OVLChatModel`                             | H2OVL                                                                    | T + I<sup>E+</sup>                                                    | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.                                                                                      |                       | ✅︎                          | ✅︎\*                     |
+| `Idefics3ForConditionalGeneration`           | Idefics3                                                                 | T + I                                                                 | `HuggingFaceM4/Idefics3-8B-Llama3` etc.                                                                                                                 | ✅︎                     |                           |  ✅︎                     |
+| `InternVLChatModel`                          | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>)                                                    | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.  |                      | ✅︎                          | ✅︎                      |
+| `KimiVLForConditionalGeneration`             | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking                               | T + I<sup>+</sup>                                                     | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`                                                                                    |                      |                             | ✅︎                       |
+| `Llama4ForConditionalGeneration`             | Llama 4                                                                  | T + I<sup>+</sup>                                                     | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. |                       | ✅︎                          | ✅︎                      |
+| `LlavaForConditionalGeneration`              | LLaVA-1.5                                                                | T + I<sup>E+</sup>                                                    | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.                                                                        |                       | ✅︎                          | ✅︎                      |
+| `LlavaNextForConditionalGeneration`          | LLaVA-NeXT                                                               | T + I<sup>E+</sup>                                                    | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.                                                                           |                       | ✅︎                          | ✅︎                      |
+| `LlavaNextVideoForConditionalGeneration`     | LLaVA-NeXT-Video                                                         | T + V                                                                 | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.                                                                                                                 |                       | ✅︎                          | ✅︎                      |
+| `LlavaOnevisionForConditionalGeneration`     | LLaVA-Onevision                                                          | T + I<sup>+</sup> + V<sup>+</sup>                                     | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.                                                            |                       | ✅︎                          | ✅︎                      |
 | `MiniCPMO`                                   | MiniCPM-O                                                                | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup>                  | `openbmb/MiniCPM-o-2_6`, etc.                                                                                                                           | ✅︎                     | ✅︎                          | ✅︎                    |
-| `MiniCPMV`                                   | MiniCPM-V                                                                | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.                                                         | ✅︎                     | ✅︎                          | ✅︎                    |
-| `MiniMaxVL01ForConditionalGeneration`        | MiniMax-VL                                                               | T + I<sup>E+</sup>                                                    | `MiniMaxAI/MiniMax-VL-01`, etc.                                                                                                                         | ✅︎                     | ✅︎                          |                       |
+| `MiniCPMV`                                   | MiniCPM-V                                                                | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.                                                         | ✅︎                     |                            | ✅︎                    |
+| `MiniMaxVL01ForConditionalGeneration`        | MiniMax-VL                                                               | T + I<sup>E+</sup>                                                    | `MiniMaxAI/MiniMax-VL-01`, etc.                                                                                                                         |                       | ✅︎                          |                       |
 | `Mistral3ForConditionalGeneration`           | Mistral3                                                                 | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.                                                                                                   | ✅︎                     | ✅︎                          | ✅︎                    |
-| `MllamaForConditionalGeneration`             | Llama 3.2                                                                | T + I<sup>+</sup>                                                     | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.                                                                     |                        |                             |                       |
+| `MllamaForConditionalGeneration`             | Llama 3.2                                                                | T + I<sup>+</sup>                                                     | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.                                                                     |                        |                             |                        |
 | `MolmoForCausalLM`                           | Molmo                                                                    | T + I<sup>+</sup>                                                     | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc.                                                                                              | ✅︎                     | ✅︎                          | ✅︎                    |
-| `NVLM_D_Model`                               | NVLM-D 1.0                                                               | T + I<sup>+</sup>                                                     | `nvidia/NVLM-D-72B`, etc.                                                                                                                               | ✅︎                     | ✅︎                          |                       |
-| `Ovis`                                       | Ovis2, Ovis1.6                                                           | T + I<sup>+</sup>                                                     | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.                                                                                                 | ✅︎                     |                             |                       |
-| `PaliGemmaForConditionalGeneration`          | PaliGemma, PaliGemma 2                                                   | T + I<sup>E</sup>                                                     | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.                                                  | ✅︎                     | ⚠️                          |                       |
-| `Phi3VForCausalLM`                           | Phi-3-Vision, Phi-3.5-Vision                                             | T + I<sup>E+</sup>                                                    | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.                                                                       | ✅︎                     | ✅︎                          |                       |
-| `Phi4MMForCausalLM`                          | Phi-4-multimodal                                                         | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc.                                                                                                             | ✅︎                     | ✅︎                          |                       |
-| `PixtralForConditionalGeneration`            | Pixtral                                                                  | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.                                                                  | ✅︎                     | ✅︎                          |                       |
+| `NVLM_D_Model`                               | NVLM-D 1.0                                                               | T + I<sup>+</sup>                                                     | `nvidia/NVLM-D-72B`, etc.                                                                                                                               |                       | ✅︎                          | ✅︎                      |
+| `Ovis`                                       | Ovis2, Ovis1.6                                                           | T + I<sup>+</sup>                                                     | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.                                                                                                 |                       |                             | ✅︎                      |
+| `PaliGemmaForConditionalGeneration`          | PaliGemma, PaliGemma 2                                                   | T + I<sup>E</sup>                                                     | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.                                                  |                       | ✅︎                          | ⚠️                       |
+| `Phi3VForCausalLM`                           | Phi-3-Vision, Phi-3.5-Vision                                             | T + I<sup>E+</sup>                                                    | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.                                                                       |                      | ✅︎                          |  ✅︎                     |
+| `Phi4MMForCausalLM`                          | Phi-4-multimodal                                                         | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc.                                                                                                             | ✅︎                     | ✅︎                          | ✅︎                       |
+| `PixtralForConditionalGeneration`            | Pixtral                                                                  | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.                                                                  |                       | ✅︎                          | ✅︎                      |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL                                                                  | T + I<sup>E+</sup>                                                    | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.                                                                                                               | ✅︎                     | ✅︎                          | ✅︎                    |
-| `Qwen2AudioForConditionalGeneration`         | Qwen2-Audio                                                              | T + A<sup>+</sup>                                                     | `Qwen/Qwen2-Audio-7B-Instruct`                                                                                                                          | ✅︎                     | ✅︎                          |                       |
+| `Qwen2AudioForConditionalGeneration`         | Qwen2-Audio                                                              | T + A<sup>+</sup>                                                     | `Qwen/Qwen2-Audio-7B-Instruct`                                                                                                                          |                       | ✅︎                          | ✅︎                      |
 | `Qwen2VLForConditionalGeneration`            | QVQ, Qwen2-VL                                                            | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.                                                                 | ✅︎                     | ✅︎                          | ✅︎                    |
 | `Qwen2_5_VLForConditionalGeneration`         | Qwen2.5-VL                                                               | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.                                                                                     | ✅︎                     | ✅︎                          | ✅︎                    |
-| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni                                                             | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>                   | `Qwen/Qwen2.5-Omni-7B`                                                                                                                                  | ✅︎                     | ✅︎\*                        |                       |
-| `SkyworkR1VChatModel`                        | Skywork-R1V-38B                                                          | T + I                                                                 | `Skywork/Skywork-R1V-38B`                                                                                                                               | ✅︎                     | ✅︎                          |                       |
-| `SmolVLMForConditionalGeneration`            | SmolVLM2                                                                 | T + I                                                                 | `SmolVLM2-2.2B-Instruct`                                                                                                                                | ✅︎                     | ✅︎                          |                       |
+| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni                                                             | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>                   | `Qwen/Qwen2.5-Omni-7B`                                                                                                                                  |                      | ✅︎                         | ✅︎\*                     |
+| `SkyworkR1VChatModel`                        | Skywork-R1V-38B                                                          | T + I                                                                 | `Skywork/Skywork-R1V-38B`                                                                                                                               |                       | ✅︎                          | ✅︎                      |
+| `SmolVLMForConditionalGeneration`            | SmolVLM2                                                                 | T + I                                                                 | `SmolVLM2-2.2B-Instruct`                                                                                                                                | ✅︎                     |                           | ✅︎                      |
 
 <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.  
 &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:  
@@ -647,7 +647,7 @@ The following table lists those that are tested in vLLM.
 
 | Architecture                        | Models             | Inputs   | Example HF Models        | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
 |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|
-| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based   | T / I    | `royokong/e5-v`          | ✅︎                     |                             |
+| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based   | T / I    | `royokong/e5-v`          |                       |                             |
 | `Phi3VForCausalLM`                  | Phi-3-Vision-based | T + I    | `TIGER-Lab/VLM2Vec-Full` | 🚧                      | ✅︎                          |
 
 #### Transcription
-- 
GitLab


From 24d0ef89705e0ab8df3d79fcbfd669cf5575772b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Thu, 29 May 2025 11:58:14 +0200
Subject: [PATCH 043/274] [Misc] Replace TODO in serving transcription (#18895)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 vllm/entrypoints/openai/serving_transcription.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py
index 13565d0ef..9fc5b562e 100644
--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -278,7 +278,9 @@ class OpenAIServingTranscription(OpenAIServing):
 
         result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None
         try:
-            # TODO(rob): subtract len of tokenized prompt.
+            # Unlike most decoder-only models, whisper generation length is not
+            # constrained by the size of the input audio, which is mapped to a
+            # fixed-size log-mel-spectogram.
             default_max_tokens = self.model_config.max_model_len
             sampling_params = request.to_sampling_params(
                 default_max_tokens, self.default_sampling_params)
-- 
GitLab


From 0b1447f890087c7610b1855ae12de023a26ddc7f Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Thu, 29 May 2025 11:05:20 +0100
Subject: [PATCH 044/274] [Bugfix] Ensure tensors are contiguous during
 serialisation (#18860)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/v1/serial_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index fbd38fc47..78f37c1e8 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -158,8 +158,8 @@ class MsgpackEncoder:
         self, obj: torch.Tensor
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
-        #  view the tensor as a 1D array of bytes
-        arr = obj.flatten().view(torch.uint8).numpy()
+        # view the tensor as a contiguous 1D array of bytes
+        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
-- 
GitLab


From f274581f44c26bcec8bcdf6d4cdfd92a4310f995 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luka=20Govedi=C4=8D?=
 <ProExpertProg@users.noreply.github.com>
Date: Thu, 29 May 2025 06:05:46 -0400
Subject: [PATCH 045/274] [BugFix] Update pydantic to fix error on python 3.10
 (#18852)

Signed-off-by: luka <luka@neuralmagic.com>
---
 requirements/common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/common.txt b/requirements/common.txt
index 625efc336..de4b3b531 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -14,7 +14,7 @@ protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
 openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
-pydantic >= 2.9
+pydantic >= 2.10
 prometheus_client >= 0.18.0
 pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
-- 
GitLab


From f8977c233f453313c124dc0753dfc1669ef401f0 Mon Sep 17 00:00:00 2001
From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com>
Date: Thu, 29 May 2025 03:07:20 -0700
Subject: [PATCH 046/274] Fix an error in dummy weight loading for quantization
 models (#18855)

Signed-off-by: Chenyaaang <chenyangli@google.com>
---
 vllm/model_executor/model_loader/weight_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f61956f4e..7a9a68be8 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -696,7 +696,7 @@ def initialize_dummy_weights(
                 # Note: We avoid using torch.rank_like as it doesn't currently
                 # support the generator argument.
                 param.copy_((high - low) *
-                            torch.rand(*param.shape,
+                            torch.rand(param.shape,
                                        generator=generator,
                                        dtype=param.dtype,
                                        layout=param.layout,
-- 
GitLab


From b169d5f7b6ad65586c5352030bda154d589a4d89 Mon Sep 17 00:00:00 2001
From: Duyi-Wang <duyi.wang@intel.com>
Date: Thu, 29 May 2025 20:02:08 +0800
Subject: [PATCH 047/274] [Misc][Tools][Benchmark] Add benchmark_serving
 supports for llama.cpp.  (#18692)

Signed-off-by: Duyi-Wang <duyi.wang@intel.com>
---
 benchmarks/backend_request_func.py | 3 ++-
 benchmarks/benchmark_serving.py    | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 88616e110..85e6eda7f 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_completions(
 
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
-                            elif usage := data.get("usage"):
+                            if usage := data.get("usage"):
                                 output.output_tokens = usage.get("completion_tokens")
                     if first_chunk_received:
                         output.success = True
@@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = {
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index a887e7150..79024a9d6 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
     if "temperature" not in sampling_params:
         sampling_params["temperature"] = 0.0  # Default to greedy decoding.
 
+    if args.backend == "llama.cpp":
+        # Disable prompt caching in llama.cpp backend
+        sampling_params["cache_prompt"] = False
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
-- 
GitLab


From 6f2909405edbd39478891f71dc2bfcbcda8530fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?=
 <ohg3417@gmail.com>
Date: Thu, 29 May 2025 23:38:55 +0900
Subject: [PATCH 048/274] [Doc]  Fix codeblocks formatting in LoRA adapters
 documentation (#18907)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
---
 docs/features/lora.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/features/lora.md b/docs/features/lora.md
index 642462f7c..04e92dbc4 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -165,6 +165,7 @@ it will first look in the local directory for a directory `foobar`, and attempt
 that adapter will then be available for normal use on the server.
 
 Alternatively, follow these example steps to implement your own plugin:
+
 1. Implement the LoRAResolver interface.
 
     Example of a simple S3 LoRAResolver implementation:
@@ -198,9 +199,9 @@ Alternatively, follow these example steps to implement your own plugin:
             return lora_request
     ```
 
-2. Register LoRAResolver plugin.
+2. Register `LoRAResolver` plugin.
 
-     ```python
+    ```python
     from vllm.lora.resolver import LoRAResolverRegistry
 
     s3_resolver = S3LoRAResolver()
-- 
GitLab


From c9479b292086b9ae0623f619af15cf54d58842c1 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 29 May 2025 22:39:25 +0800
Subject: [PATCH 049/274] [Bugfix] Fix the failing gte embedding test (#18720)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 tests/conftest.py                              | 13 +++++++------
 .../models/language/pooling/test_embedding.py  | 18 +++++++++++-------
 tests/models/language/pooling/test_gte.py      |  1 +
 tests/models/utils.py                          |  1 +
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 19c2c6247..26674483f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -311,6 +311,7 @@ class HfRunner:
         dtype: str = "auto",
         *,
         model_kwargs: Optional[dict[str, Any]] = None,
+        trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
@@ -320,7 +321,7 @@ class HfRunner:
 
         self.config = AutoConfig.from_pretrained(
             model_name,
-            trust_remote_code=True,
+            trust_remote_code=trust_remote_code,
         )
         self.device = self.get_default_device()
         self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
@@ -336,7 +337,7 @@ class HfRunner:
                 model_name,
                 device=self.device,
                 model_kwargs=model_kwargs,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
             )
         elif is_cross_encoder:
             # Lazy init required for AMD CI
@@ -346,12 +347,12 @@ class HfRunner:
                 model_name,
                 device=self.device,
                 automodel_args=model_kwargs,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
             )
         else:
             model = auto_cls.from_pretrained(
                 model_name,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
                 **model_kwargs,
             )
 
@@ -372,7 +373,7 @@ class HfRunner:
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
             )
 
         # don't put this import at the top level
@@ -381,7 +382,7 @@ class HfRunner:
         self.processor = AutoProcessor.from_pretrained(
             model_name,
             torch_dtype=torch_dtype,
-            trust_remote_code=True,
+            trust_remote_code=trust_remote_code,
         )
         if skip_tokenizer_init:
             self.tokenizer = self.processor.tokenizer
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index a44b2154b..306cfdf37 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -10,18 +10,22 @@ from ...utils import check_embeddings_close
 @pytest.mark.parametrize(
     "model",
     [
-        # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5",
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
-        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
-        pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
+        # Be careful of the order of models, decoder-only models should be
+        # placed before encoder-only models, otherwise `Qwen2.5-0.5B-Instruct`
+        # case won't pass because gte-Qwen2-1.5B-instruct will cache custom
+        # model code with bidirectional attention.
         # [Decoder-only]
         pytest.param("BAAI/bge-multilingual-gemma2",
                      marks=[pytest.mark.core_model]),
         pytest.param("intfloat/e5-mistral-7b-instruct",
                      marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
+        # [Encoder-only]
+        pytest.param("BAAI/bge-base-en-v1.5",
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
+        pytest.param("intfloat/multilingual-e5-small"),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Cross-Encoder]
         pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
@@ -44,7 +48,7 @@ def test_models(
     vllm_extra_kwargs = {}
     if model == "ssmits/Qwen2-7B-Instruct-embed-base":
         vllm_extra_kwargs["override_pooler_config"] = \
-            PoolerConfig(pooling_type="MEAN")
+            PoolerConfig(pooling_type="MEAN", normalize=False)
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py
index 18b27a688..725e3d168 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -45,6 +45,7 @@ MODELS = [
     ########### Qwen2ForCausalLM
     EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                    architecture="Qwen2ForCausalLM",
+                   dtype="float32",
                    enable_test=True),
     ########## ModernBertModel
     EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
diff --git a/tests/models/utils.py b/tests/models/utils.py
index ac1fc6c8f..ffc904bd1 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -314,6 +314,7 @@ def check_embeddings_close(
                                   dim=0)
 
         fail_msg = (f"Test{prompt_idx}:"
+                    f"\nCosine similarity: \t{sim:.4f}"
                     f"\n{name_0}:\t{embeddings_0[:16]!r}"
                     f"\n{name_1}:\t{embeddings_1[:16]!r}")
 
-- 
GitLab


From da4b69d0b435e74c0a208cffbf2f430f5529ec64 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Thu, 29 May 2025 10:48:24 -0400
Subject: [PATCH 050/274] [Attention][V1] Toggle for v1 attention backend
 (#18275)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 vllm/attention/ops/chunked_prefill_paged_decode.py |  4 ++--
 vllm/attention/ops/prefix_prefill.py               |  4 ++--
 vllm/envs.py                                       | 12 ++++++++++--
 vllm/v1/attention/backends/triton_attn.py          |  9 ++++++---
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py
index 785799b6b..6ca2a6414 100644
--- a/vllm/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/attention/ops/chunked_prefill_paged_decode.py
@@ -264,8 +264,8 @@ def chunked_prefill_paged_decode(
     # Conversion of FP8 Tensor from uint8 storage to
     # appropriate torch.dtype for interpretation by Triton
     if "fp8" in kv_cache_dtype:
-        assert key_cache.dtype == torch.uint8
-        assert value_cache.dtype == torch.uint8
+        assert key_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
+        assert value_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
 
         if kv_cache_dtype in ("fp8", "fp8_e4m3"):
             target_dtype = current_platform.fp8_dtype()
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 86d256b63..729b61b02 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -744,8 +744,8 @@ def context_attention_fwd(q,
     # Conversion of FP8 Tensor from uint8 storage to
     # appropriate torch.dtype for interpretation by Triton
     if "fp8" in kv_cache_dtype:
-        assert (k_cache.dtype == torch.uint8)
-        assert (v_cache.dtype == torch.uint8)
+        assert k_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
+        assert v_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
 
         if kv_cache_dtype in ("fp8", "fp8_e4m3"):
             target_dtype = current_platform.fp8_dtype()
diff --git a/vllm/envs.py b/vllm/envs.py
index b007bf8c5..bd9104afa 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -15,6 +15,7 @@ if TYPE_CHECKING:
     VLLM_NCCL_SO_PATH: Optional[str] = None
     LD_LIBRARY_PATH: Optional[str] = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
     VLLM_FLASH_ATTN_VERSION: Optional[int] = None
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: Optional[str] = None
@@ -290,6 +291,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
              ("true", "1")),
 
+    # Use separate prefill and decode kernels for V1 attention instead of
+    # the unified triton kernel.
+    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
+    lambda:
+    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
+     ("true", "1")),
+
     # Force vllm to use a specific flash-attention version (2 or 3), only valid
     # when using the flash-attention backend.
     "VLLM_FLASH_ATTN_VERSION":
@@ -323,8 +331,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to log responses from API Server for debugging
     "VLLM_DEBUG_LOG_API_SERVER_RESPONSE":
-    lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False").
-    lower() == "true",
+    lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
+                           ).lower() == "true",
 
     # S3 access information, used for tensorizer to load model from S3
     "S3_ACCESS_KEY_ID":
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 4000f9398..a97bb8500 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Optional
 import torch
 
 from vllm import _custom_ops as ops
+from vllm import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.ops.chunked_prefill_paged_decode import (
@@ -126,6 +127,8 @@ class TritonAttentionImpl(AttentionImpl):
                                       "TritonAttentionImpl")
 
         self.fp8_dtype = current_platform.fp8_dtype()
+        self.force_prefill_decode_attn = \
+            envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
 
     def forward(
         self,
@@ -166,9 +169,9 @@ class TritonAttentionImpl(AttentionImpl):
         # performance to make sure it does not introduce any overhead.
 
         num_queries_per_kv = query.shape[1] // key.shape[1]
-        use_prefill_decode_attn = (num_queries_per_kv &
-                                   (num_queries_per_kv - 1)) != 0
-
+        num_q_is_pow2 = (num_queries_per_kv & (num_queries_per_kv - 1)) == 0
+        use_prefill_decode_attn = (self.force_prefill_decode_attn
+                                   or not num_q_is_pow2)
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         if use_prefill_decode_attn:
-- 
GitLab


From 1b7cfd5a367b8aa60ce6a9acd058047a3b20c797 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Thu, 29 May 2025 12:13:18 -0400
Subject: [PATCH 051/274] [ROCm][V0][Attention] Revert to the previous FA
 triton kernel (#18226)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 vllm/attention/backends/rocm_flash_attn.py   |    5 +-
 vllm/attention/ops/triton_flash_attention.py | 1766 +++++++-----------
 vllm/platforms/rocm.py                       |    6 +
 3 files changed, 694 insertions(+), 1083 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index abcb68911..7134472da 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -770,8 +770,9 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                                       and layer._v_scale and layer._prob_scale
                                       and self.kv_cache_dtype == "fp8")
                     full_scales = (
-                        layer._q_scale, layer._k_scale, layer._v_scale,
-                        layer._prob_scale) if use_fp8_scales else None
+                        layer._q_scale.item(), layer._k_scale.item(),
+                        layer._v_scale.item(),
+                        layer._prob_scale.item()) if use_fp8_scales else None
                     self.triton_attn_func(
                         query,
                         key,
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 8940d0b66..62cfb813d 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -1,236 +1,33 @@
+#!/usr/bin/env python
 # SPDX-License-Identifier: Apache-2.0
 """
 Fused Attention
 ===============
 
-This is a Triton implementation of the Flash Attention v2 algorithm
-See https://tridao.me/publications/flash2/flash2.pdf
+This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao
+(https://tridao.me/publications/flash2/flash2.pdf)
+Credits: OpenAI kernel team, AMD ML Frameworks Triton team
 
-Credits:
-AMD Triton kernels team
-OpenAI kernel team
-
-Currently only the forward kernel is supported, and contains these features:
+Features supported:
 
 1) Fwd with causal masking
-2) Arbitrary Q and KV sequence lengths
-3) Arbitrary head sizes
-4) Multi and grouped query attention
-5) Variable sequence lengths
-6) ALiBi and matrix bias
-7) FP8 support
+2) Any sequence lengths without padding (currently fwd kernel only)
+3) Support for different sequence lengths for q and k
+4) Nested tensor API currently does not support dropout or bias.
 
-"""
+Not currently supported:
 
-from typing import Optional
+1) Non power of two head dims
+
+"""
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.platforms.rocm import on_gfx1x
 from vllm.triton_utils import tl, triton
 
-SUPPORTED_LAYOUTS = ['thd', 'bhsd', 'bshd']
-
-default_eight_bit_dtype_triton = tl.float8e4b8
-default_eight_bit_dtype_torch = current_platform.fp8_dtype()
-default_float8_info = torch.finfo(default_eight_bit_dtype_torch)
-
-FP8_MIN = triton.language.constexpr(default_float8_info.min)
-
-# According to https://github.com/vllm-project/vllm/blob/main
-#              /csrc/quantization/utils.cuh#L31,
-# need to make the max for the uz datatype be 224.0 for accuracy reasons.
-FP8_MAX = triton.language.constexpr(
-    default_float8_info.max if default_eight_bit_dtype_torch !=
-    torch.float8_e4m3fnuz else 224.0)
-
-
-class MetaData:
-    cu_seqlens_q = None
-    cu_seqlens_k = None
-    max_seqlens_q = 0
-    max_seqlens_k = 0
-    bias = None
-    alibi_slopes = None
-    causal = False
-    num_contexts = 0
-    varlen = False
-    eight_bit = False
-    layout = None
-    return_encoded_softmax = False
-    eight_bit_dtype_triton = default_eight_bit_dtype_triton
-    eight_bit_dtype_torch = default_eight_bit_dtype_torch
-    output_dtype = None
-
-    # Note about layouts:
-    #
-    # thd - [num_tokens, num_heads, head_size]
-    # bshd - [batch_size, seq_len, num_heads, head_size]
-    # bhsd - [batch_size, num_heads, seq_len, head_size]
-    #
-    # This is for each tensor, all tensors must have same layout.
-    # Q can have num_heads and seq_len differ from  from K and V,
-    # however K and V must agree on this.
-    #
-    # Notes about varlen and bias:
-    # Only one or the other is implemented, meaning can't combine
-    # both varlen and bias right now.
-    #
-    # Note about quantization:
-    # Only 8-bit quantization supported (for now) and specifically fp8.
-    # Scales must be tensors.
-    # o_scale: This is 'output scaling', but comes from parameter called
-    # 'input_scale', this is applied to the output from the kernel.
-    # o_scale should be None if none of the other quantization parameters
-    # are used.
-    #
-    # NOTE: Object is in a tentatively good state after initialized, however,
-    # to verify, call check_args(q,k,v,o) where o is the output tensor.
-    def __init__(
-        self,
-        sm_scale=1.0,
-        layout=None,  # layout can be 'bshd', 'bhsd', or 'thd'
-        output_dtype=None,
-        max_seqlens_q=0,
-        max_seqlens_k=0,
-        # varlen params
-        cu_seqlens_q=None,  # only 'thd' layout supported for varlen 
-        cu_seqlens_k=None,
-        # quant params
-        q_descale=None,
-        k_descale=None,
-        v_descale=None,
-        p_scale=None,
-        o_scale=None,
-        # bias params
-        bias=None,  # varlen not implemented for bias
-        seqlen_q=None,
-        seqlen_k=None,
-        # alibi params
-        alibi_slopes=None,
-        alibi_batch=None,
-        alibi_nheads=None,
-        # causal
-        causal=None,
-    ):
-        self.sm_scale = sm_scale
-        self.output_dtype = output_dtype
-        self.max_seqlens_q = max_seqlens_q
-        self.max_seqlens_k = max_seqlens_k
-        self.layout = layout
-        if cu_seqlens_q is not None or cu_seqlens_k is not None:
-            assert cu_seqlens_q is not None and cu_seqlens_k is not None
-            assert layout is None or layout not in [
-                'bshd', 'bhsd'
-            ], "Varlen only implemented for thd layout"
-            self.set_varlen_params(cu_seqlens_q, cu_seqlens_k)
-        quant_params = [q_descale, k_descale, v_descale, p_scale, o_scale]
-        if any(x is not None for x in quant_params):
-            p_descale = 1.0 / p_scale if p_scale is not None else None
-            self.set_eight_bit_params(q_descale, k_descale, v_descale, p_scale,
-                                      p_descale, o_scale)
-        if bias is not None:
-            self.need_bias(bias, seqlen_q, seqlen_k)
-        if alibi_slopes is not None:
-            self.need_alibi(alibi_slopes, alibi_batch, alibi_nheads)
-        if causal is not None and causal:
-            self.need_causal()
-
-    def set_varlen_params(self, cu_seqlens_q, cu_seqlens_k):
-        self.varlen = True
-        self.layout = 'thd'
-        self.cu_seqlens_q = cu_seqlens_q
-        self.cu_seqlens_k = cu_seqlens_k
-        # Without "varlen", there should still be one sequence.
-        assert len(cu_seqlens_q) >= 2
-        assert len(cu_seqlens_q) == len(cu_seqlens_k)
-        self.num_contexts = len(cu_seqlens_q) - 1
-        for i in range(0, self.num_contexts):
-            self.max_seqlens_q = max(
-                cu_seqlens_q[i + 1].item() - cu_seqlens_q[i].item(),
-                self.max_seqlens_q)
-            self.max_seqlens_k = max(
-                cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item(),
-                self.max_seqlens_k)
-
-    def set_eight_bit_params(self, q_descale, k_descale, v_descale, p_scale,
-                             p_descale, o_scale):
-        self.eight_bit = True
-        self.q_descale = q_descale
-        self.k_descale = k_descale
-        self.v_descale = v_descale
-        self.p_scale = p_scale
-        self.p_descale = p_descale
-        self.o_scale = o_scale
-        self.use_p_scale = (p_scale is not None) and (
-            p_descale is not None) and (v_descale is not None)
-        self.eight_bit_kv = ((q_descale is None) and (k_descale is not None)
-                             and (v_descale is not None))
-        self.eight_bit_dtype_torch = default_eight_bit_dtype_torch
-
-    def need_bias(self, bias, seqlen_q, seqlen_k):
-        assert bias is not None
-        assert bias.is_cuda
-        assert bias.dim() == 4
-        assert bias.shape[0] == 1
-        assert bias.shape[2:] == (seqlen_q, seqlen_k)
-        self.bias = bias
-
-    def need_alibi(self, alibi_slopes, batch, nheads):
-        assert alibi_slopes.is_cuda
-        assert alibi_slopes.dim() == 2
-        assert alibi_slopes.shape[0] == batch
-        assert alibi_slopes.shape[1] == nheads
-        self.alibi_slopes = alibi_slopes
-
-    def need_causal(self):
-        self.causal = True
-
-    def check_args(self, q, k, v, o):
-        assert q.dim() == k.dim() and q.dim() == v.dim()
-
-        batch, nheads_q, nheads_k, head_size = get_shape_from_layout(
-            q, k, self)
-        if self.varlen:
-            assert q.dim() == 3
-            assert self.cu_seqlens_q is not None
-            assert self.cu_seqlens_k is not None
-            assert len(self.cu_seqlens_q) == len(self.cu_seqlens_k)
-            # TODO: Remove once bias is supported with varlen
-            assert self.bias is None
-            assert not self.return_encoded_softmax
-        else:
-            assert q.dim() == 4
-            assert self.max_seqlens_q > 0 and self.max_seqlens_k > 0
-            assert self.cu_seqlens_q is None and self.cu_seqlens_k is None
-        assert k.shape == v.shape
-        assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
-        # TODO: Change assert if we support qkl f8 and v f16
-        if self.eight_bit:
-            if self.eight_bit_kv:
-                assert (v.dtype == k.dtype
-                        and k.dtype == self.eight_bit_dtype_torch)
-                assert q.dtype != k.dtype
-                assert (self.v_descale is not None) and (self.k_descale
-                                                         is not None)
-            else:
-                assert (q.dtype == k.dtype and q.dtype == v.dtype
-                        and q.dtype == self.eight_bit_dtype_torch)
-                assert (self.q_descale
-                        is not None) and (self.k_descale
-                                          is not None) and (self.v_descale
-                                                            is not None)
-                if self.use_p_scale:
-                    assert (self.p_scale is not None) and (self.p_descale
-                                                           is not None)
-        else:
-            assert (q.dtype == k.dtype) and (q.dtype == v.dtype)
-        assert head_size <= 256
-        assert o.shape == q.shape
-        assert (nheads_q % nheads_k) == 0
-        assert self.layout is not None
-        assert self.layout == 'thd' or not self.varlen
+torch_dtype: tl.constexpr = torch.float16
 
 
 @triton.jit
@@ -243,85 +40,40 @@ def max_fn(x, y):
     return tl.math.max(x, y)
 
 
-# Convenience function to load with optional boundary checks.
-# "First" is the major dim, "second" is the minor dim.
 @triton.jit
-def masked_load(ptrs, offset_first, offset_second, boundary_first,
-                boundary_second):
-    if offset_first is not None and offset_second is not None:
-        mask = (offset_first[:, None] < boundary_first) & \
-               (offset_second[None, :] < boundary_second)
-        tensor = tl.load(ptrs, mask=mask, other=0.0)
-    elif offset_first is not None:
-        mask = offset_first[:, None] < boundary_first
-        tensor = tl.load(ptrs, mask=mask, other=0.0)
-    elif offset_second is not None:
-        mask = offset_second[None, :] < boundary_second
-        tensor = tl.load(ptrs, mask=mask, other=0.0)
-    else:
-        tensor = tl.load(ptrs)
-    return tensor
+def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
+    ms = tl.arange(0, m)
+    ns = tl.arange(0, n)
+    return philox_offset + ms[:, None] * stride + ns[None, :]
 
 
 @triton.jit
-def compute_alibi_block(alibi_slope,
-                        seqlen_q,
-                        seqlen_k,
-                        offs_m,
-                        offs_n,
-                        transpose=False):
-    # when seqlen_k and seqlen_q are different we want the diagonal to stick to
-    # the bottom right of the attention matrix
-    # for casual mask we want something like this where (1 is kept and 0 is
-    # masked)
-    # seqlen_q = 2 and seqlen_k = 5
-    #   1 1 1 1 0
-    #   1 1 1 1 1
-    # seqlen_q = 5 and seqlen_k = 2
-    #        0 0
-    #        0 0
-    #        0 0
-    #        1 0
-    #        1 1
-    # for alibi the diagonal is 0 indicating no penalty for attending to that
-    # spot and increasing penalty for attending further from the diagonal
-    # e.g. alibi_slope = 1, seqlen_q = 2, seqlen_k = 5,
-    # offs_m = [0, 1, 2, 3], offs_n = [0, 1, 2, 3, 4], transpose = False
-    # 1. offs_m[:,None] = [[0],
-    #                       [1],
-    # 2. offs_m[:,None] + seqlen_k = [[5],
-    #                                  [6],
-    # 3. offs_m[:,None] + seqlen_k - seqlen_q = [[3],
-    #                                             [4],
-    # 4. offs_m[:,None] + seqlen_k - seqlen_q - offs_n[None,:] =
-    # [[3], - [[0, 1, 2, 3, 4]] =  [[ 3, 2, 1, 0,-1], [4], [ 4, 3, 2, 1, 0]]
-    # 5. -1 * alibi_slope * tl.abs(relative_pos_block) = [[ -3, -2, -1, 0,-1],
-    #                                                    [ -4, -3, -2, -1, 0]],
-    relative_pos_block = (offs_m[:, None] + seqlen_k - seqlen_q -
-                          offs_n[None, :])
-    alibi_block = -1 * alibi_slope * tl.abs(relative_pos_block)
-    if transpose:
-        return alibi_block.T
-    else:
-        return alibi_block
+def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n,
+                                  stride).to(tl.uint32)
+    # TODO: use tl.randint for better performance
+    return tl.rand(philox_seed, rng_offsets)
 
 
-def compute_alibi_tensor(alibi_slopes, seqlen_q, seqlen_k):
-    q_idx = torch.arange(seqlen_q, dtype=torch.int32,
-                         device="cuda").unsqueeze(-1)  # (N_CTX_Q, 1)
-    k_idx = torch.arange(seqlen_k, dtype=torch.int32,
-                         device="cuda").unsqueeze(0)  # (1, N_CTX_K)
-    relative_pos = torch.abs(q_idx + seqlen_k - seqlen_q -
-                             k_idx)  # (N_CTX_Q, N_CTX_K)
-    return -1 * alibi_slopes.unsqueeze(-1).unsqueeze(
-        -1) * relative_pos  # (Z, H, N_CTX_Q, N_CTX_K)
+@triton.jit
+def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n,
+                             stride)
+    rng_keep = rng_output > dropout_p
+    return rng_keep
 
 
 @triton.jit
-def quant_fp8(x, scale):
-    x *= scale
-    x = tl.clamp(x, FP8_MIN, FP8_MAX)
-    return x
+def load_fn(block_ptr, first, second, pad):
+    if first and second:
+        tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad)
+    elif first:
+        tensor = tl.load(block_ptr, boundary_check=(0, ), padding_option=pad)
+    elif second:
+        tensor = tl.load(block_ptr, boundary_check=(1, ), padding_option=pad)
+    else:
+        tensor = tl.load(block_ptr)
+    return tensor
 
 
 @triton.jit
@@ -330,68 +82,61 @@ def _attn_fwd_inner(
     l_i,
     m_i,
     q,
-    k_ptrs,
-    v_ptrs,
-    bias_ptrs,
-    stride_kn,
-    stride_vk,
-    stride_bn,
+    K_block_ptr,
+    V_block_ptr,
     start_m,
     actual_seqlen_k,
-    actual_seqlen_q,
+    dropout_p,
     philox_seed,
     batch_philox_offset,
-    encoded_sm_ptrs,
+    encoded_softmax_block_ptr,
     block_min,
     block_max,
     offs_n_causal,
     masked_blocks,
     n_extra_tokens,
-    alibi_slope,
-    q_descale,
-    k_descale,
-    v_descale,
-    p_scale,
+    bias_ptr,
     IS_CAUSAL: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
     OFFS_M: tl.constexpr,
     OFFS_N: tl.constexpr,
-    SHOULD_PRE_LOAD_V: tl.constexpr,
-    SHOULD_MASK_STEPS: tl.constexpr,
-    SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr,
-    USE_PADDED_HEAD: tl.constexpr,
-    IS_ACTUAL_BLOCK_DMODEL: tl.constexpr,
-    QK_SCALE: tl.constexpr,
-    IS_EIGHT_BIT_GEMM: tl.constexpr,
-    USE_P_SCALE: tl.constexpr,
-    IS_EIGHT_BIT_KV: tl.constexpr,
-    QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton,
+    PRE_LOAD_V: tl.constexpr,
+    MASK_STEPS: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+    PADDED_HEAD: tl.constexpr,
+    USE_FP8: tl.constexpr,
+    qk_scale,
+    p_descale,
 ):
-
     # loop over k, v, and update accumulator
     for start_n in range(block_min, block_max, BLOCK_N):
         # For padded blocks, we will overrun the tensor size if
         # we load all BLOCK_N. For others, the blocks are all within range.
-        k_offs_n = start_n + tl.arange(0,
-                                       BLOCK_N) if SHOULD_MASK_STEPS else None
-        k_offs_k = None if not USE_PADDED_HEAD else tl.arange(0, BLOCK_DMODEL)
-        k = masked_load(k_ptrs, k_offs_k, k_offs_n, IS_ACTUAL_BLOCK_DMODEL,
-                        actual_seqlen_k)
-        if SHOULD_PRE_LOAD_V:
-            # We can use the same offsets as k, just with dims transposed.
-            v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k,
-                            IS_ACTUAL_BLOCK_DMODEL)
+        k = load_fn(
+            K_block_ptr,
+            PADDED_HEAD,
+            MASK_STEPS and (n_extra_tokens != 0),
+            "zero",
+        )
+        if PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         # We start from end of seqlen_k so only the first iteration would need
         # to be checked for padding if it is not a multiple of block_n
         # TODO: This can be optimized to only be true for the padded block.
-        if SHOULD_MASK_STEPS:  # noqa: SIM102
+        if MASK_STEPS:  # noqa: SIM102
             # If this is the last block / iteration, we want to
             # mask if the sequence length is not a multiple of block size
-            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not
-            # is_modulo_mn. last step might get wasted but that is okay.
+            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps
+            # if not is_modulo_mn. last step might get wasted but that is okay.
             # check if this masking works for that case.
             if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
                 boundary_m = tl.full([BLOCK_M],
@@ -404,107 +149,112 @@ def _attn_fwd_inner(
             causal_boundary = start_n + offs_n_causal
             causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
             qk = tl.where(causal_mask, qk, float("-inf"))
-
         # -- compute qk ----
-        if IS_EIGHT_BIT_GEMM:
-            qk += ((((tl.dot(q, k).to(tl.float32) * q_descale)) * k_descale) *
-                   QK_SCALE)
-        else:
-            if IS_EIGHT_BIT_KV:
-                k = (k * k_descale).to(q.type.element_ty)
-            qk += (tl.dot(q, k) * QK_SCALE)
-
-        if bias_ptrs is not None:
-            bias_offs_n = start_n + tl.arange(
-                0, BLOCK_N) if SHOULD_MASK_STEPS else None
-            bias = masked_load(bias_ptrs, OFFS_M, bias_offs_n, actual_seqlen_q,
-                               actual_seqlen_k)
-            # While bias is added after multiplying qk with sm_scale,
-            # our optimization to use 2^x instead of e^x results in an
-            # additional scale factor of log2(e) which we must also multiply
-            # the bias with.
-            qk += (bias * 1.44269504089)
-
-        if alibi_slope is not None:
-            # Compute the global position of each token within the sequence
-            global_m_positions = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-            global_n_positions = start_n + tl.arange(0, BLOCK_N)
-            alibi_block = compute_alibi_block(alibi_slope, actual_seqlen_q,
-                                              actual_seqlen_k,
-                                              global_m_positions,
-                                              global_n_positions)
-            qk += (alibi_block * 1.44269504089)  # scale factor of log2(e)
-
-        # softmax
+        qk += tl.dot(q, k)
+        if USE_FP8:
+            qk *= qk_scale
+        if bias_ptr is not None:
+            bias = load_fn(bias_ptr, False, MASK_STEPS
+                           and (n_extra_tokens != 0), "zero")
+            # While bias is added after multiplying qk with sm_scale, our
+            # optimization to use 2^x instead of e^x results in an additional
+            # scale factor of log2(e) which we must also multiply the bias with.
+            qk += bias * 1.44269504089
         m_ij = tl.maximum(m_i, tl.max(qk, 1))
         qk = qk - m_ij[:, None]
         p = tl.math.exp2(qk)
 
         # CAVEAT: Must update l_ij before applying dropout
         l_ij = tl.sum(p, 1)
-        if SHOULD_RETURN_ENCODED_SOFTMAX:
-            tl.store(encoded_sm_ptrs, p.to(encoded_sm_ptrs.type.element_ty))
+        if ENABLE_DROPOUT:
+            philox_offset = (batch_philox_offset +
+                             start_m * BLOCK_M * actual_seqlen_k + start_n -
+                             BLOCK_N)
+            keep = dropout_mask(
+                philox_seed,
+                philox_offset,
+                dropout_p,
+                BLOCK_M,
+                BLOCK_N,
+                actual_seqlen_k,
+            )
+            if RETURN_ENCODED_SOFTMAX:
+                tl.store(
+                    encoded_softmax_block_ptr,
+                    tl.where(keep, p,
+                             -p).to(encoded_softmax_block_ptr.type.element_ty),
+                )
+            p = tl.where(keep, p, 0.0)
+        elif RETURN_ENCODED_SOFTMAX:
+            tl.store(
+                encoded_softmax_block_ptr,
+                p.to(encoded_softmax_block_ptr.type.element_ty),
+            )
         # -- update output accumulator --
         alpha = tl.math.exp2(m_i - m_ij)
         acc = acc * alpha[:, None]
-        if not SHOULD_PRE_LOAD_V:
-            v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k,
-                            IS_ACTUAL_BLOCK_DMODEL)
+        if not PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
         # -- update m_i and l_i
         l_i = l_i * alpha + l_ij
         # update m_i and l_i
         m_i = m_ij
 
-        if IS_EIGHT_BIT_GEMM:
-            if USE_P_SCALE:
-                p = quant_fp8(p, p_scale).to(QUANT_DTYPE)
-                acc += tl.dot(p, v)
-            else:
-                # v is in eight_bit but p is not, we want the gemm in p's type
-                acc += tl.dot(p, v.to(p.type.element_ty))
-        else:
-            if IS_EIGHT_BIT_KV:
-                v = (v * v_descale).to(p.type.element_ty)
-            acc += tl.dot(p.to(v.type.element_ty), v)
-
-        k_ptrs += BLOCK_N * stride_kn
-        v_ptrs += BLOCK_N * stride_vk
-        if bias_ptrs is not None:
-            bias_ptrs += BLOCK_N * stride_bn
-        if SHOULD_RETURN_ENCODED_SOFTMAX:
-            encoded_sm_ptrs += BLOCK_N
+        if USE_FP8:
+            p *= p_descale
+
+        acc += tl.dot(p.to(V_block_ptr.type.element_ty), v)
+
+        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
+                                                   (0, BLOCK_N))
     return acc, l_i, m_i
 
 
 def get_cdna_autotune_configs():
     return [
+        triton.Config(
+            {
+                'BLOCK_M': 256,
+                'BLOCK_N': 64,
+                'waves_per_eu': 2,
+                'PRE_LOAD_V': False
+            },
+            num_stages=1,
+            num_warps=8),
         triton.Config(
             {
                 'BLOCK_M': 128,
                 'BLOCK_N': 128,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=4),
         triton.Config(
             {
-                'BLOCK_M': 128,
-                'BLOCK_N': 64,
+                'BLOCK_M': 256,
+                'BLOCK_N': 128,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
-            num_warps=4),
+            num_warps=8),
         triton.Config(
             {
                 'BLOCK_M': 128,
                 'BLOCK_N': 64,
-                'waves_per_eu': 3,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'waves_per_eu': 1,
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=4),
@@ -512,168 +262,141 @@ def get_cdna_autotune_configs():
             {
                 'BLOCK_M': 128,
                 'BLOCK_N': 64,
-                'waves_per_eu': 1,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'waves_per_eu': 3,
+                'PRE_LOAD_V': True
             },
             num_stages=1,
             num_warps=4),
         triton.Config(
             {
                 'BLOCK_M': 128,
-                'BLOCK_N': 32,
-                'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'BLOCK_N': 64,
+                'waves_per_eu': 3,
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=4),
-    ], [
-        'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K',
-        'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK'
-    ]
-
-
-def get_rdna_autotune_configs():
-    return [
         triton.Config(
             {
-                'BLOCK_M': 32,
-                'BLOCK_N': 32,
+                'BLOCK_M': 64,
+                'BLOCK_N': 64,
                 'waves_per_eu': 4,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
-            num_warps=2),
+            num_warps=8),
         triton.Config(
             {
                 'BLOCK_M': 32,
                 'BLOCK_N': 32,
-                'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'waves_per_eu': 4,
+                'PRE_LOAD_V': False
             },
             num_stages=1,
-            num_warps=2),
+            num_warps=8),
+        # TODO: This config fails with head_size not pow2 with data mismatches.
+        #    triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1,
+        #                   'PRE_LOAD_V': False}, num_stages=1, num_warps=4),
+
+        # Fails in AccelerateAMDMatmul (Triton) assert when using FP8:
+        # triton.Config(
+        #     {
+        #         "BLOCK_M": 16,
+        #         "BLOCK_N": 16,
+        #         "waves_per_eu": 1,
+        #         "PRE_LOAD_V": False,
+        #     },
+        #     num_stages=1,
+        #     num_warps=4,
+        # ),
+    ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8']
+
+
+def get_rdna_autotune_configs():
+    return [
         triton.Config(
             {
                 'BLOCK_M': 32,
-                'BLOCK_N': 16,
+                'BLOCK_N': 32,
                 'waves_per_eu': 4,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
         triton.Config(
             {
                 'BLOCK_M': 32,
-                'BLOCK_N': 16,
+                'BLOCK_N': 32,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
         triton.Config(
             {
-                'BLOCK_M': 16,
+                'BLOCK_M': 32,
                 'BLOCK_N': 16,
                 'waves_per_eu': 4,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
         triton.Config(
             {
-                'BLOCK_M': 16,
+                'BLOCK_M': 32,
                 'BLOCK_N': 16,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=2),
-        # Fall-back config.
-        triton.Config(
-            {
-                'BLOCK_M': 16,
-                'BLOCK_N': 16,
-                'waves_per_eu': 1,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
-    ], [
-        'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K',
-        'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK'
-    ]
-
-
-def get_general_autotune_configs():
-    return [
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 128,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 64,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 32,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=4),
-    ], [
-        'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K',
-        'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK'
-    ]
-
-
-def has_cdna_target():
-    ROCM_CDNA_TARGETS = ["gfx942", "gfx90a", "gfx908"]
-    return triton.runtime.driver.active.get_current_target(
-    ).arch in ROCM_CDNA_TARGETS
-
-
-def is_rocm_cdna():
-    return current_platform.is_rocm() and has_cdna_target()
+        # Fails in AccelerateAMDMatmul (Triton) assert when using FP8:
+        # triton.Config(
+        #     {
+        #         'BLOCK_M': 16,
+        #         'BLOCK_N': 16,
+        #         'waves_per_eu': 4,
+        #         'PRE_LOAD_V': False
+        #     },
+        #     num_stages=1,
+        #     num_warps=2),
+        # triton.Config(
+        #     {
+        #         'BLOCK_M': 16,
+        #         'BLOCK_N': 16,
+        #         'waves_per_eu': 2,
+        #         'PRE_LOAD_V': False
+        #     },
+        #     num_stages=1,
+        #     num_warps=2),
+        # # Fall-back config.
+        # triton.Config(
+        #     {
+        #         'BLOCK_M': 16,
+        #         'BLOCK_N': 16,
+        #         'waves_per_eu': 1,
+        #         'PRE_LOAD_V': False
+        #     },
+        #     num_stages=1,
+        #     num_warps=2),
+    ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8']
 
 
 def get_autotune_configs():
-    if is_rocm_cdna():
-        return get_cdna_autotune_configs()
-    elif current_platform.is_rocm():
+    if on_gfx1x():
         return get_rdna_autotune_configs()
     else:
-        return get_general_autotune_configs()
+        return get_cdna_autotune_configs()
 
 
 autotune_configs, autotune_keys = get_autotune_configs()
 
+float8_info = torch.finfo(current_platform.fp8_dtype())
+
 
 @triton.autotune(
     configs=autotune_configs,
     key=autotune_keys,
-    use_cuda_graph=True,
 )
 @triton.jit
 def attn_fwd(
@@ -681,7 +404,13 @@ def attn_fwd(
     K,
     V,
     bias,
-    SM_SCALE: tl.constexpr,
+    sm_scale,
+    q_scale,
+    k_scale,
+    v_scale,
+    p_scale,
+    p_descale,
+    o_descale,
     L,
     Out,
     stride_qz: tl.int64,
@@ -704,70 +433,44 @@ def attn_fwd(
     stride_bh: tl.int64,
     stride_bm: tl.int64,
     stride_bn: tl.int64,
-    stride_az: tl.int64,
-    stride_ah: tl.int64,
-    q_descale_ptr,
-    k_descale_ptr,
-    p_scale_ptr,
-    p_descale_ptr,
-    o_descale_ptr,
-    v_descale_ptr,
-    q_descale_has_singleton: tl.constexpr,
-    k_descale_has_singleton: tl.constexpr,
-    p_descale_has_singleton: tl.constexpr,
-    v_descale_has_singleton: tl.constexpr,
     cu_seqlens_q,
     cu_seqlens_k,
+    dropout_p,
     philox_seed,
-    NUM_CU: tl.constexpr,
-    GRID_CU_MULTIP: tl.constexpr,
-    B: tl.constexpr,
     philox_offset_base,
     encoded_softmax,
-    alibi_slopes,
     HQ: tl.constexpr,
     HK: tl.constexpr,
-    IS_ACTUAL_BLOCK_DMODEL: tl.constexpr,
+    ACTUAL_BLOCK_DMODEL: tl.constexpr,
     MAX_SEQLENS_Q: tl.constexpr,
     MAX_SEQLENS_K: tl.constexpr,
     VARLEN: tl.constexpr,
     IS_CAUSAL: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
+    USE_FP8: tl.constexpr,
+    USE_FP8_OUT: tl.constexpr,
     BLOCK_N: tl.constexpr,
-    SHOULD_PRE_LOAD_V: tl.constexpr,
-    USE_BIAS: tl.constexpr,
-    SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr,
-    USE_ALIBI: tl.constexpr,
-    IS_EIGHT_BIT: tl.constexpr,
-    USE_P_SCALE: tl.constexpr,
-    IS_EIGHT_BIT_KV: tl.constexpr,
-    QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton,
+    PRE_LOAD_V: tl.constexpr,
+    BIAS_TYPE: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+    FP8_MIN: tl.constexpr = float8_info.min,
+    FP8_MAX: tl.constexpr = float8_info.max,
 ):
-
-    if o_descale_ptr is not None:
-        o_descale = tl.load(o_descale_ptr)
-
-    start_m: tl.int64 = tl.program_id(0)
-    off_h_q: tl.int64 = tl.program_id(1)
-    off_z: tl.int64 = tl.program_id(2)
-
-    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M).to(tl.int64)
-    offs_n = tl.arange(0, BLOCK_N).to(tl.int64)
-    offs_d = tl.arange(0, BLOCK_DMODEL).to(tl.int64)
-
-    # as we can't have return statements inside while loop in Triton
-    continue_condition = True
-
+    start_m = tl.program_id(0)
+    off_h_q = tl.program_id(1)
+    off_z = tl.program_id(2)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
     if VARLEN:
         cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
         cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
         seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start
-        # We have a one-size-fits-all grid in id(0). Some seqlens might be
-        # too small for all start_m so for those we return early.
+        # We have a one-size-fits-all grid in id(0). Some seqlens might be too
+        # small for all start_m so for those we return early.
         if start_m * BLOCK_M > seqlen_q:
-            continue_condition = False
-            # return
+            return
         cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
         cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
         seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start
@@ -777,598 +480,499 @@ def attn_fwd(
         seqlen_q = MAX_SEQLENS_Q
         seqlen_k = MAX_SEQLENS_K
 
-    if continue_condition:
-        # Now we compute whether we need to exit early due to causal
-        # masking. This is because for seqlen_q > seqlen_k, M rows of the
-        # attn scores are completely masked, resulting in 0s written to the
-        # output, and inf written to LSE. We don't need to do any GEMMs in
-        # this case. This block of code determines what N is, and if this
-        # WG is operating on those M rows.
-        n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
-        if (IS_CAUSAL):
-            # If seqlen_q == seqlen_k, the attn scores are a square matrix.
-            # If seqlen_q != seqlen_k, attn scores are rectangular which
-            # means the causal mask boundary is bottom right aligned, and
-            # ends at either the top edge (seqlen_q < seqlen_k) or left
-            # edge. This captures the decrease in n_blocks if we have a
-            # rectangular attn matrix
-            n_blocks_seqlen = cdiv_fn(
-                (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
-            # This is what adjusts the block_max for the current WG, only
-            # if IS_CAUSAL. Otherwise we want to always iterate through all
-            # n_blocks
-            n_blocks = min(n_blocks, n_blocks_seqlen)
-            # If we have no blocks after adjusting for seqlen deltas, this
-            # WG is part of the blocks that are all 0. We exit early.
-            if n_blocks <= 0:
-                o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh +
-                            cu_seqlens_q_start * stride_om)
-                o_ptrs = (o_offset + offs_m[:, None] * stride_om +
-                          offs_d[None, :] * stride_on)
-                acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-                o_ptrs_mask = (offs_m[:, None] < seqlen_q).broadcast_to(
-                    [BLOCK_M, BLOCK_DMODEL])
-                # We still need to write 0s to the result
-                tl.store(o_ptrs, acc, mask=o_ptrs_mask)
-                # The tensor allocated for L is based on MAX_SEQLENS_Q as
-                # that is statically known.
-                l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q +
-                          off_h_q * MAX_SEQLENS_Q + offs_m)
-                # We store inf to LSE, not -inf because in the bwd pass,
-                # we subtract this from qk which makes it -inf, such that
-                # exp(qk - inf) = 0 for these masked blocks.
-                l_value = tl.full([BLOCK_M],
-                                  value=float("inf"),
-                                  dtype=tl.float32)
-                l_ptrs_mask = offs_m < MAX_SEQLENS_Q
-                tl.store(l_ptrs, l_value, mask=l_ptrs_mask)
-                # TODO: Should dropout and return encoded softmax be
-                # handled here too?
-                continue_condition = False
-                # return
-
-        if continue_condition:
-            # If MQA / GQA, set the K and V head offsets appropriately.
-            GROUP_SIZE: tl.constexpr = HQ // HK
-            off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q
-            n_extra_tokens = 0
-            if seqlen_k < BLOCK_N:
-                n_extra_tokens = BLOCK_N - seqlen_k
-            elif seqlen_k % BLOCK_N:
-                n_extra_tokens = seqlen_k % BLOCK_N
-            USE_PADDED_HEAD: tl.constexpr = (IS_ACTUAL_BLOCK_DMODEL
-                                             != BLOCK_DMODEL)
-
-            # Compute pointers for all the tensors used in this kernel.
-            q_offset = (Q + off_z * stride_qz + off_h_q * stride_qh +
-                        cu_seqlens_q_start * stride_qm)
-            q_ptrs = (q_offset + offs_m[:, None] * stride_qm +
-                      offs_d[None, :] * stride_qk)
-            k_offset = (K + off_z * stride_kz + off_h_k * stride_kh +
-                        cu_seqlens_k_start * stride_kn)
-            k_ptrs = (k_offset + offs_d[:, None] * stride_kk +
-                      offs_n[None, :] * stride_kn)
-            v_offset = (V + off_z * stride_vz + off_h_k * stride_vh +
-                        cu_seqlens_k_start * stride_vk)
-            v_ptrs = (v_offset + offs_n[:, None] * stride_vk +
-                      offs_d[None, :] * stride_vn)
-            # Compute pointers for all scale tensors used in this kernel.
-
-            IS_EIGHT_BIT_GEMM: tl.constexpr = IS_EIGHT_BIT & (
-                not IS_EIGHT_BIT_KV)
-            if IS_EIGHT_BIT:
-                if k_descale_has_singleton:
-                    k_descale_ptrs = k_descale_ptr
-                else:
-                    k_descale_ptrs = k_descale_ptr + off_h_k
-
-                if v_descale_has_singleton:
-                    v_descale_ptrs = v_descale_ptr
-                else:
-                    v_descale_ptrs = v_descale_ptr + off_h_k
-
-                if not IS_EIGHT_BIT_KV:
-                    if q_descale_has_singleton:
-                        q_descale_ptrs = q_descale_ptr
-                    else:
-                        q_descale_ptrs = q_descale_ptr + off_h_q
-                if USE_P_SCALE:
-                    if p_descale_has_singleton:
-                        p_scale_ptrs = p_scale_ptr
-                        p_descale_ptrs = p_descale_ptr
-                    else:
-                        p_scale_ptrs = p_scale_ptr + off_h_q
-                        p_descale_ptrs = p_descale_ptr + off_h_q
-
-            if USE_BIAS:
-                bias_offset = off_h_q * stride_bh
-                bias_ptrs = (bias + bias_offset + offs_m[:, None] * stride_bm +
-                             offs_n[None, :] * stride_bn)
-            else:
-                bias_ptrs = None
-
-            if USE_ALIBI:
-                a_offset = off_z * stride_az + off_h_q * stride_ah
-                alibi_slope = tl.load(alibi_slopes + a_offset)
-            else:
-                alibi_slope = None
-
-            batch_philox_offset = 0
-            # We can ask to return the dropout mask without doing any
-            # dropout. In this case, we return an invalid pointer so
-            # indicate the mask is not valid.
-            if SHOULD_RETURN_ENCODED_SOFTMAX:
-                encoded_sm_base = (encoded_softmax +
-                                   off_h_q * seqlen_q * seqlen_k)
-                encoded_sm_ptrs = (encoded_sm_base +
-                                   offs_m[:, None] * seqlen_k +
-                                   offs_n[None, :])
-            else:
-                encoded_sm_ptrs = None
-            # initialize pointer to m and l
-            m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
-            l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
-            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-            # scale sm_scale by log_2(e) and use 2^x in the loop as we do
-            # not have native e^x support in HW.
-            QK_SCALE: tl.constexpr = SM_SCALE * 1.44269504089
-            # Q is loaded once at the beginning and shared by all N blocks.
-            q_ptrs_mask = offs_m[:, None] < seqlen_q
-            if USE_PADDED_HEAD:
-                q_ptrs_mask = q_ptrs_mask & (offs_d[None, :]
-                                             < IS_ACTUAL_BLOCK_DMODEL)
-            q = tl.load(q_ptrs, mask=q_ptrs_mask, other=0.0)
-
-            if IS_EIGHT_BIT:
-                k_descale = tl.load(k_descale_ptrs)
-                v_descale = tl.load(v_descale_ptrs)
-                q_descale = None if IS_EIGHT_BIT_KV else tl.load(
-                    q_descale_ptrs)
-                if USE_P_SCALE:
-                    p_scale = tl.load(p_scale_ptrs)
-                    p_descale = tl.load(p_descale_ptrs)
-                else:
-                    p_scale = None
-                    p_descale = None
-            else:
-                q_descale = None
-                k_descale = None
-                v_descale = None
-                p_scale = None
-                p_descale = None
-            # Here we compute how many full and masked blocks we have.
-            padded_block_k = n_extra_tokens != 0
-            is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
-            if IS_CAUSAL:
-                # There are always at least BLOCK_M // BLOCK_N masked
-                # blocks.  Additionally there might be one more due to
-                # dissimilar seqlens.
-                masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
-            else:
-                # Padding on Q does not need to be masked in the FA loop.
-                masked_blocks = padded_block_k
-            # if IS_CAUSAL, not is_modulo_mn does not always result in an
-            # additional block. In this case we might exceed n_blocks so
-            # pick the min.
-            masked_blocks = min(masked_blocks, n_blocks)
-            n_full_blocks = n_blocks - masked_blocks
-            block_min = 0
-            block_max = n_blocks * BLOCK_N
-            # Compute for full blocks. Here we set causal to false
-            # regardless of its actual value because there is no masking.
-            # Similarly we do not need padding.
-            if n_full_blocks > 0:
-                block_max = (n_blocks - masked_blocks) * BLOCK_N
-                acc, l_i, m_i = _attn_fwd_inner(
-                    acc,
-                    l_i,
-                    m_i,
-                    q,
-                    k_ptrs,
-                    v_ptrs,
-                    bias_ptrs,
-                    stride_kn,
-                    stride_vk,
-                    stride_bn,
-                    start_m,
-                    seqlen_k,
-                    seqlen_q,
-                    philox_seed,
-                    batch_philox_offset,
-                    encoded_sm_ptrs,
-                    # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
-                    block_min,
-                    block_max,
-                    0,
-                    0,
-                    0,
-                    alibi_slope,
-                    q_descale,
-                    k_descale,
-                    v_descale,
-                    p_scale,
-                    # IS_CAUSAL, ....
-                    False,
-                    BLOCK_M,
-                    BLOCK_DMODEL,
-                    BLOCK_N,
-                    offs_m,
-                    offs_n,
-                    # _, SHOULD_MASK_STEPS, ...
-                    SHOULD_PRE_LOAD_V,
-                    False,
-                    SHOULD_RETURN_ENCODED_SOFTMAX,
-                    USE_PADDED_HEAD,
-                    IS_ACTUAL_BLOCK_DMODEL,
-                    QK_SCALE,
-                    IS_EIGHT_BIT_GEMM,
-                    USE_P_SCALE,
-                    IS_EIGHT_BIT_KV,
-                    QUANT_DTYPE)
-                block_min = block_max
-                block_max = n_blocks * BLOCK_N
-
-            tl.debug_barrier()
-            # Remaining blocks, if any, are full / not masked.
-            if (masked_blocks > 0):
-                if IS_CAUSAL:
-                    offs_n_causal = offs_n + (seqlen_q - seqlen_k)
-                else:
-                    offs_n_causal = 0
-                k_ptrs += n_full_blocks * BLOCK_N * stride_kn
-                v_ptrs += n_full_blocks * BLOCK_N * stride_vk
-                if USE_BIAS:
-                    bias_ptrs += n_full_blocks * BLOCK_N * stride_bn
-                if SHOULD_RETURN_ENCODED_SOFTMAX:
-                    encoded_sm_ptrs += n_full_blocks * BLOCK_N
-                acc, l_i, m_i = _attn_fwd_inner(
-                    acc,
-                    l_i,
-                    m_i,
-                    q,
-                    k_ptrs,
-                    v_ptrs,
-                    bias_ptrs,
-                    stride_kn,
-                    stride_vk,
-                    stride_bn,
-                    start_m,
-                    seqlen_k,
-                    seqlen_q,
-                    philox_seed,
-                    batch_philox_offset,
-                    encoded_sm_ptrs,
-                    block_min,
-                    block_max,
-                    offs_n_causal,
-                    masked_blocks,
-                    n_extra_tokens,
-                    alibi_slope,
-                    q_descale,
-                    k_descale,
-                    v_descale,
-                    p_scale,
-                    IS_CAUSAL,
-                    BLOCK_M,
-                    BLOCK_DMODEL,
-                    BLOCK_N,
-                    offs_m,
-                    offs_n,
-                    # _, SHOULD_MASK_STEPS, ...
-                    SHOULD_PRE_LOAD_V,
-                    True,
-                    SHOULD_RETURN_ENCODED_SOFTMAX,
-                    USE_PADDED_HEAD,
-                    IS_ACTUAL_BLOCK_DMODEL,
-                    QK_SCALE,
-                    IS_EIGHT_BIT_GEMM,
-                    USE_P_SCALE,
-                    IS_EIGHT_BIT_KV,
-                    QUANT_DTYPE)
-
-            if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV:
-                if USE_P_SCALE:
-                    acc *= p_descale
-                acc *= v_descale
-
-            # epilogue
-            # This helps the compiler do Newton Raphson on l_i vs on acc
-            # which is much larger.
-            l_recip = 1 / l_i[:, None]
-            acc = acc * l_recip
-
-            # If seqlen_q > seqlen_k but the delta is not a multiple of
-            # BLOCK_M, then we have one block with a row of all NaNs which
-            # come from computing softmax over a row of all
-            # -infs (-inf - inf = NaN). We check for that here and store 0s
-            # where there are NaNs as these rows should've been zeroed out.
-            end_m_idx = (start_m + 1) * BLOCK_M
-            start_m_idx = start_m * BLOCK_M
-            causal_start_idx = seqlen_q - seqlen_k
-            if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV:  # noqa: SIM102
-                if o_descale_ptr is not None:
-                    acc = quant_fp8(acc, o_descale)
-
-            acc = acc.to(Out.type.element_ty)
-            if IS_CAUSAL:  # noqa: SIM102
-                if (causal_start_idx > start_m_idx
-                        and causal_start_idx < end_m_idx):
-                    out_mask_boundary = tl.full((BLOCK_DMODEL, ),
-                                                causal_start_idx,
-                                                dtype=tl.int32)
-                    mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
-                    out_ptrs_mask = (mask_m_offsets[:, None]
-                                     >= out_mask_boundary[None, :])
-                    z = tl.zeros((1, ), tl.float32)
-                    acc = tl.where(out_ptrs_mask, acc,
-                                   z.to(acc.type.element_ty))
-            # write back LSE
-            l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q +
-                      off_h_q * MAX_SEQLENS_Q + offs_m)
-            # If seqlen_q not multiple of BLOCK_M, we need to mask out the
-            # last few rows. This is only true for the last M block.
-            # For others, overflow_size will be -ve
-            overflow_size = end_m_idx - seqlen_q
-            if overflow_size > 0:
-                boundary = tl.full((BLOCK_M, ),
-                                   BLOCK_M - overflow_size,
-                                   dtype=tl.int32)
-                l_ptrs_mask = tl.arange(0, BLOCK_M) < boundary
-                tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
-            else:
-                tl.store(l_ptrs, m_i + tl.math.log2(l_i))
-
-            # write back O
-            o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh +
-                        cu_seqlens_q_start * stride_om)
-            o_ptrs = (o_offset + offs_m[:, None] * stride_om +
-                      offs_d[None, :] * stride_on)
-            o_ptrs_mask = tl.full([BLOCK_M, BLOCK_DMODEL], 1, dtype=tl.int1)
-            if overflow_size > 0:
-                o_ptrs_mask = o_ptrs_mask & (offs_m[:, None] < seqlen_q)
-            if USE_PADDED_HEAD:
-                o_ptrs_mask = o_ptrs_mask & (offs_d[None, :]
-                                             < IS_ACTUAL_BLOCK_DMODEL)
-            tl.store(o_ptrs, acc.to(Out.dtype.element_ty), mask=o_ptrs_mask)
-
-
-def get_shape_from_layout(q, k, metadata):
-    assert metadata.layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
-
-    if metadata.layout == 'thd':
-        nheads_q, nheads_k = q.shape[1], k.shape[1]
-        head_size = q.shape[-1]
-        batch = metadata.num_contexts
-    elif metadata.layout == 'bhsd':
-        batch, nheads_q, _, head_size = q.shape
-        nheads_k = k.shape[1]
-    elif metadata.layout == 'bshd':
-        batch, _, nheads_q, head_size = q.shape
-        nheads_k = k.shape[2]
-    return batch, nheads_q, nheads_k, head_size
-
-
-def get_strides_from_layout(q, k, v, o, metadata):
-    assert metadata.layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
-
-    STRIDE_PERMUTATIONS = {
-        'thd': (None, 1, 0, 2),
-        'bhsd': (0, 1, 2, 3),
-        'bshd': (0, 2, 1, 3),
-    }
-
-    perm = STRIDE_PERMUTATIONS[metadata.layout]
-    stride = lambda x, p: (0 if p is None else x.stride(p))
-    strides = lambda x: (stride(x, p) for p in perm)
-
-    return tuple(strides(x) for x in [q, k, v, o])
+    # Now we compute whether we need to exit early due to causal masking.
+    # This is because for seqlen_q > seqlen_k, M rows of the attn scores
+    # are completely masked, resulting in 0s written to the output, and
+    # inf written to LSE. We don't need to do any GEMMs in this case.
+    # This block of code determines what N is, and if this WG is operating
+    # on those M rows.
+    n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
+    if IS_CAUSAL:
+        # If seqlen_q == seqlen_k, the attn scores are a square matrix.
+        # If seqlen_q != seqlen_k, attn scores are rectangular which means
+        # the causal mask boundary is bottom right aligned, and ends at either
+        # the top edge (seqlen_q < seqlen_k) or left edge.
+        # This captures the decrease in n_blocks if we have a rectangular attn
+        # matrix
+        n_blocks_seqlen = cdiv_fn(
+            (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
+        # This is what adjusts the block_max for the current WG, only
+        # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks
+        n_blocks = min(n_blocks, n_blocks_seqlen)
+        # If we have no blocks after adjusting for seqlen deltas, this WG is
+        # part of the blocks that are all 0. We exit early.
+        if n_blocks <= 0:
+            o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
+                        off_h_q * stride_oh)
+            O_block_ptr = tl.make_block_ptr(
+                base=Out + o_offset,
+                shape=(seqlen_q, BLOCK_DMODEL),
+                strides=(stride_om, stride_on),
+                offsets=(start_m * BLOCK_M, 0),
+                block_shape=(BLOCK_M, BLOCK_DMODEL),
+                order=(1, 0),
+            )
+            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty)
+            # We still need to write 0s to the result
+            # tl.store(O_block_ptr,
+            # acc.to(Out.type.element_ty), boundary_check=(0,1))
+            # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q
+            #          + offs_m
+            # We store inf to LSE, not -inf because in the bwd pass,
+            # we subtract this
+            # from qk which makes it -inf, such that exp(qk - inf) = 0
+            # for these masked blocks.
+            # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32)
+            # tl.store(l_ptrs, l)
+            # TODO: Should dropout and return encoded softmax be handled here?
+            return
+
+    # If MQA / GQA, set the K and V head offsets appropriately.
+    GROUP_SIZE: tl.constexpr = HQ // HK
+    off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q
+
+    n_extra_tokens = 0
+    if seqlen_k < BLOCK_N:
+        n_extra_tokens = BLOCK_N - seqlen_k
+    elif seqlen_k % BLOCK_N:
+        n_extra_tokens = seqlen_k % BLOCK_N
+    padded_head = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL
+
+    # Compute pointers for all the tensors used in this kernel.
+    q_offset = (off_z * stride_qz + off_h_q * stride_qh +
+                cu_seqlens_q_start * stride_qm)
+    Q_block_ptr = tl.make_block_ptr(
+        base=Q + q_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_qm, stride_qk),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    k_offset = (off_z * stride_kz + off_h_k * stride_kh +
+                cu_seqlens_k_start * stride_kn)
+    K_block_ptr = tl.make_block_ptr(
+        base=K + k_offset,
+        shape=(ACTUAL_BLOCK_DMODEL, seqlen_k),
+        strides=(stride_kk, stride_kn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_DMODEL, BLOCK_N),
+        order=(0, 1),
+    )
+    v_offset = (off_z * stride_vz + off_h_k * stride_vh +
+                cu_seqlens_k_start * stride_vk)
+    V_block_ptr = tl.make_block_ptr(
+        base=V + v_offset,
+        shape=(seqlen_k, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_vk, stride_vn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_N, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    if BIAS_TYPE != 0:
+        bias_ptr = tl.make_block_ptr(
+            base=bias + off_h_q * stride_bh,
+            shape=(seqlen_q, seqlen_k),
+            strides=(stride_bm, stride_bn),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        bias_ptr = None
+    if ENABLE_DROPOUT:
+        batch_philox_offset = philox_offset_base \
+                              + (off_z * HQ + off_h_q) \
+                              * seqlen_q * seqlen_k
+    else:
+        batch_philox_offset = 0
+    # We can ask to return the dropout mask without actually doing any dropout.
+    # In this case, we return an invalid pointer so indicate the mask is not i
+    # valid.
+    # TODO: Fix encoded softmax. It currently uses just h_q in the base offset.
+    if RETURN_ENCODED_SOFTMAX:
+        encoded_softmax_block_ptr = tl.make_block_ptr(
+            base=encoded_softmax + off_h_q * seqlen_q * seqlen_k,
+            shape=(seqlen_q, seqlen_k),
+            strides=(seqlen_k, 1),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        encoded_softmax_block_ptr = 0
+    # initialize pointer to m and l
+    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # scale sm_scale by log_2(e) and use 2^x in the loop as we do not
+    # have native e^x support in HW.
+    qk_scale = sm_scale * 1.44269504089
+    # Q is loaded once at the beginning and shared by all N blocks.
+    q = load_fn(Q_block_ptr, True, padded_head, "zero")
+    if not USE_FP8:
+        q = (q * qk_scale).to(Q_block_ptr.type.element_ty)
+        acc_scale = 1.0
+    else:
+        qk_scale *= q_scale * k_scale
+        acc_scale = p_scale * v_scale
+
+    # Here we compute how many full and masked blocks we have.
+    padded_block_k = n_extra_tokens != 0
+    is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
+    if IS_CAUSAL:
+        # There are always at least BLOCK_M // BLOCK_N masked blocks.
+        # Additionally there might be one more due to dissimilar seqlens.
+        masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
+    else:
+        # Padding on Q does not need to be masked in the FA loop.
+        masked_blocks = padded_block_k
+    # if IS_CAUSAL, not is_modulo_mn does not always result in an additional
+    # block. In this case we might exceed n_blocks so pick the min.
+    masked_blocks = min(masked_blocks, n_blocks)
+    n_full_blocks = n_blocks - masked_blocks
+    block_min = 0
+    block_max = n_blocks * BLOCK_N
+    # Compute for full blocks. Here we set causal to false regardless of its
+    # value because there is no masking. Similarly we do not need padding.
+    if n_full_blocks > 0:
+        block_max = (n_blocks - masked_blocks) * BLOCK_N
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
+            block_min,
+            block_max,
+            0,
+            0,
+            0,
+            bias_ptr,
+            # IS_CAUSAL, ....
+            False,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            False,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            padded_head,
+            USE_FP8,
+            qk_scale,
+            p_descale,
+        )
+        block_min = block_max
+        block_max = n_blocks * BLOCK_N
+
+    tl.debug_barrier()
+    # Remaining blocks, if any, are full / not masked.
+    if masked_blocks > 0:
+        offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0
+        K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N))
+        V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
+                                                   (0, n_full_blocks))
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            block_min,
+            block_max,
+            offs_n_causal,
+            masked_blocks,
+            n_extra_tokens,
+            bias_ptr,
+            IS_CAUSAL,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            True,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            padded_head,
+            USE_FP8,
+            qk_scale,
+            p_descale,
+        )
+    # epilogue
+
+    if USE_FP8:
+        acc *= acc_scale
+    acc = acc / l_i[:, None]
+    if ENABLE_DROPOUT:
+        acc = acc / (1 - dropout_p)
+    # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M,
+    # then we have one block with a row of all NaNs which come from computing
+    # softmax over a row of all -infs (-inf - inf = NaN). We check for that here
+    # and store 0s where there are NaNs as these rows should've been zeroed out.
+    end_m_idx = (start_m + 1) * BLOCK_M
+    start_m_idx = start_m * BLOCK_M
+    causal_start_idx = seqlen_q - seqlen_k
+    if USE_FP8_OUT:
+        acc *= o_descale
+        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
+    acc = acc.to(Out.type.element_ty)
+    if IS_CAUSAL:  # noqa: SIM102
+        if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx:
+            out_mask_boundary = tl.full((BLOCK_DMODEL, ),
+                                        causal_start_idx,
+                                        dtype=tl.int32)
+            mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
+            out_ptrs_mask = (mask_m_offsets[:, None]
+                             >= out_mask_boundary[None, :])
+            z = tl.zeros((1, ), tl.float32)
+            acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
+    # write back LSE
+    # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m
+    # If seqlen_q not multiple of BLOCK_M, we need to mask out the last
+    # few rows. This is only true for the last M block. For others,
+    # overflow_size will be -ve
+    # overflow_size = end_m_idx - seqlen_q
+    # if overflow_size > 0:
+    #    boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32)
+    #    # This is a > check because mask being 0 blocks the store.
+    #    l_ptrs_mask = boundary > tl.arange(0, BLOCK_M)
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
+    # else:
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i))
+
+    # write back O
+    o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
+                off_h_q * stride_oh)
+    O_block_ptr = tl.make_block_ptr(
+        base=Out + o_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_om, stride_on),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    # Need boundary check on this to make sure the padding from the
+    # Q and KV tensors in both dims are not part of what we store back.
+    # TODO: Do the boundary check optionally.
+    tl.store(O_block_ptr, acc, boundary_check=(0, 1))
+
+
+def check_args(
+    q,
+    k,
+    v,
+    o,
+    varlen=True,
+    max_seqlens=None,
+    cu_seqlens_q=None,
+    cu_seqlens_k=None,
+):
+    assert q.dim() == k.dim() and q.dim() == v.dim()
+    if varlen:
+        assert q.dim() == 3
+        total_q, nheads_q, head_size = q.shape
+        total_k, nheads_k, _ = k.shape
+        assert cu_seqlens_q is not None
+        assert cu_seqlens_k is not None
+        assert len(cu_seqlens_q) == len(cu_seqlens_k)
+    else:
+        assert q.dim() == 4
+        batch, nheads_q, seqlen_q, head_size = q.shape
+        _, nheads_k, seqlen_k, _ = k.shape
+        assert max_seqlens > 0
+    assert k.shape == v.shape
+    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
+    # TODO: Change assert if we support qkl f8 and v f16
+    assert q.dtype == k.dtype and q.dtype == v.dtype
+    assert head_size <= 256
+    assert o.shape == q.shape
+    assert (nheads_q % nheads_k) == 0
 
 
 class _attention(torch.autograd.Function):
 
     @staticmethod
-    def forward(ctx, q, k, v, o, metadata: MetaData):
-        # NOTE: a large bias tensor leads to overflow during pointer arithmetic
-        if (metadata.bias is not None):
-            assert (metadata.bias.numel() < 2**31)
+    def forward(
+        ctx,
+        q,
+        k,
+        v,
+        o,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlens_q,
+        max_seqlens_k,
+        causal=False,
+        sm_scale=1.0,
+        bias=None,
+        fp8_scales=None,
+        fp8_out_scale=None,
+    ):
+        if fp8_scales is not None:
+            use_fp8 = True
+            (q_scale, k_scale, v_scale, p_scale) = fp8_scales
+            float8 = current_platform.fp8_dtype()
+
+            def check_and_convert(t, scale):
+                if t.dtype != float8:
+                    descale = 1.0 / scale
+                    ts = (t * descale).clamp(min=float8_info.min,
+                                             max=float8_info.max)
+                    return ts.to(float8)
+                else:
+                    return t
 
-        if o is None:
-            if metadata.eight_bit:
-                o = torch.empty_like(
-                    q,
-                    dtype=metadata.output_dtype if metadata.output_dtype
-                    is not None else metadata.eight_bit_dtype_torch)
-            else:
-                o = torch.empty_like(q, dtype=q.dtype)
+            q = check_and_convert(q, q_scale)
+            k = check_and_convert(k, k_scale)
+            v = check_and_convert(v, v_scale)
+        else:
+            use_fp8 = False
+            q_scale = k_scale = v_scale = p_scale = 1.0
 
-        metadata.check_args(q, k, v, o)
+        if o is None:
+            o = torch.empty_like(q, dtype=v.dtype)
 
-        batch, nheads_q, nheads_k, head_size = get_shape_from_layout(
-            q, k, metadata)
-        q_strides, k_strides, v_strides, o_strides = get_strides_from_layout(
-            q, k, v, o, metadata)
+        check_args(
+            q,
+            k,
+            v,
+            o,
+            varlen=True,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+        )
+        if True:  # varlen
+            total_q, nheads_q, head_size = q.shape
+            total_k, nheads_k, _ = k.shape
+            batch = len(cu_seqlens_q) - 1
+            q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
+            k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
+            v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
+            o_strides = (0, o.stride(1), o.stride(0), o.stride(2))
+        else:
+            batch, seqlen_q, nheads_q, head_size = q.shape
+            _, seqlen_k, nheads_k, _ = k.shape
+            q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3))
+            k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3))
+            v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3))
+            o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3))
 
         # Get closest power of 2 over or equal to 32.
-        padded_d_model = 1 << (head_size - 1).bit_length()
-        # Smallest head_dim supported is 16. If smaller, the tile in the
-        # kernel is padded - there is no padding in memory for any dims.
-        padded_d_model = max(padded_d_model, 16)
-
-        # encoded_softmax is used to validate dropout behavior vs the
-        # PyTorch SDPA math backend reference.  We zero this out to give a
-        # consistent starting point and then populate it with the output of
-        # softmax with the sign bit set according to the dropout mask.
-        # The resulting return allows this mask to be fed into the reference
-        # implementation for testing only.  This return holds no useful output
-        # aside from debugging.
-        if metadata.return_encoded_softmax:
-            encoded_softmax = torch.zeros(
-                (q.shape[0], q.shape[1], q.shape[2], k.shape[2]),
-                device=q.device,
-                dtype=torch.float32)
+        unpadded_head_dims = {32, 64, 128, 256}
+        if head_size not in unpadded_head_dims:
+            padded_d_model = None
+            for i in unpadded_head_dims:
+                if i > head_size:
+                    padded_d_model = i
+                    break
+            assert padded_d_model is not None
         else:
-            encoded_softmax = None
+            padded_d_model = head_size
+
+        grid = lambda META: (
+            triton.cdiv(max_seqlens_q, META["BLOCK_M"]),
+            nheads_q,
+            batch,
+        )
 
-        M = torch.empty((batch, nheads_q, metadata.max_seqlens_q),
-                        device=q.device,
-                        dtype=torch.float32)
+        encoded_softmax = None
 
         # Seed the RNG so we get reproducible results for testing.
         philox_seed = 0x1BF52
         philox_offset = 0x1D4B42
 
-        if metadata.bias is not None:
-            bias_strides = (metadata.bias.stride(0), metadata.bias.stride(1),
-                            metadata.bias.stride(2), metadata.bias.stride(3))
+        if bias is not None:
+            bias_strides = (
+                bias.stride(0),
+                bias.stride(1),
+                bias.stride(2),
+                bias.stride(3),
+            )
         else:
             bias_strides = (0, 0, 0, 0)
 
-        if metadata.alibi_slopes is not None:
-            alibi_strides = (metadata.alibi_slopes.stride(0),
-                             metadata.alibi_slopes.stride(1))
-        else:
-            alibi_strides = (0, 0)
+        p_descale = 1.0 / p_scale
+        o_descale = 1.0 / fp8_out_scale.item(
+        ) if fp8_out_scale is not None else 1.0
 
-        if metadata.eight_bit:
-            q_descale, k_descale, p_scale, p_descale, v_descale, o_scale = (
-                metadata.q_descale, metadata.k_descale, metadata.p_scale,
-                metadata.p_descale, metadata.v_descale, metadata.o_scale)
-            o_descale = 1.0 / o_scale if o_scale is not None else None
-        else:
-            q_descale = k_descale = p_scale = None
-            p_descale = v_descale = o_descale = None
-
-        # number of compute units available
-        NUM_CU = torch.cuda.get_device_properties("cuda").multi_processor_count
-
-        grid = lambda META: (triton.cdiv(metadata.max_seqlens_q, META[
-            'BLOCK_M']), nheads_q, batch)
+        arg_max_seqlens_q = 0 if on_gfx1x() else max_seqlens_q
+        arg_max_seqlens_k = 0 if on_gfx1x() else max_seqlens_k
 
         attn_fwd[grid](
             q,
             k,
             v,
-            metadata.bias,
-            metadata.sm_scale,
-            M,
+            bias,
+            sm_scale,
+            q_scale,
+            k_scale,
+            v_scale,
+            p_scale,
+            p_descale,
+            o_descale,
+            None,
             o,
             *q_strides,
             *k_strides,
             *v_strides,
             *o_strides,
             *bias_strides,
-            *alibi_strides,
-            q_descale,
-            k_descale,
-            p_scale,
-            p_descale,
-            o_descale,
-            v_descale,
-            q_descale.numel() == 1 if q_descale is not None else False,
-            k_descale.numel() == 1 if k_descale is not None else False,
-            p_descale.numel() == 1 if p_descale is not None else False,
-            v_descale.numel() == 1 if v_descale is not None else False,
-            metadata.cu_seqlens_q,
-            metadata.cu_seqlens_k,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            dropout_p=0.0,
             philox_seed=philox_seed,
             philox_offset_base=philox_offset,
             encoded_softmax=encoded_softmax,
-            alibi_slopes=metadata.alibi_slopes,
             HQ=nheads_q,
             HK=nheads_k,
-            IS_ACTUAL_BLOCK_DMODEL=head_size,
-            MAX_SEQLENS_Q=metadata.max_seqlens_q,
-            MAX_SEQLENS_K=metadata.max_seqlens_k,
-            IS_CAUSAL=metadata.causal,
-            VARLEN=metadata.varlen,
+            ACTUAL_BLOCK_DMODEL=head_size,
+            MAX_SEQLENS_Q=arg_max_seqlens_q,
+            MAX_SEQLENS_K=arg_max_seqlens_k,
+            IS_CAUSAL=causal,
+            VARLEN=True,
             BLOCK_DMODEL=padded_d_model,
-            USE_BIAS=metadata.bias is not None,
-            USE_ALIBI=metadata.alibi_slopes is not None,
-            SHOULD_RETURN_ENCODED_SOFTMAX=metadata.return_encoded_softmax,
-            IS_EIGHT_BIT=metadata.eight_bit,
-            USE_P_SCALE=metadata.eight_bit and metadata.use_p_scale,
-            IS_EIGHT_BIT_KV=metadata.eight_bit and metadata.eight_bit_kv,
-            NUM_CU=NUM_CU,
-            B=batch,
-            QUANT_DTYPE=metadata.eight_bit_dtype_triton)
+            BIAS_TYPE=0 if bias is None else 1,
+            ENABLE_DROPOUT=False,
+            RETURN_ENCODED_SOFTMAX=False,
+            USE_FP8=use_fp8,
+            USE_FP8_OUT=fp8_out_scale is not None,
+        )
 
         ctx.grid = grid
-        ctx.sm_scale = metadata.sm_scale
+        ctx.sm_scale = sm_scale
         ctx.BLOCK_DMODEL = head_size
-        ctx.causal = metadata.causal
-        ctx.alibi_slopes = metadata.alibi_slopes
+        ctx.causal = causal
+        ctx.dropout_p = 0.0
         ctx.philox_seed = philox_seed
         ctx.philox_offset = philox_offset
         ctx.encoded_softmax = encoded_softmax
-        ctx.return_encoded_softmax = metadata.return_encoded_softmax
+        ctx.return_encoded_softmax = False
         return o, encoded_softmax
 
 
-triton_attention_rocm = _attention.apply
-
-
-def scale_fp8(t, scale=None):
-    t_scaled, scale_out = ops.scaled_fp8_quant(t.reshape(-1, t.shape[-1]),
-                                               scale)
-    return t_scaled.reshape(t.shape), scale_out
-
-
-def maybe_quantize_fp8(t, scale):
-    eight_bit_dtype = current_platform.fp8_dtype()
-    if t.dtype != eight_bit_dtype:
-        t, _ = scale_fp8(t, scale)
-    return t
-
-
-def check_and_maybe_quantize_qkv(q, k, v, fp8_scales):
-    (q_scale, k_scale, v_scale, p_scale) = fp8_scales
-
-    q = maybe_quantize_fp8(q, q_scale)
-    k = maybe_quantize_fp8(k, k_scale)
-    v = maybe_quantize_fp8(v, v_scale)
-
-    return q, k, v
-
-
-# query   - [num_tokens, num_heads, head_size]
-# key     - [num_tokens, num_kv_heads, head_size]
-# value   - [num_tokens, num_kv_heads, head_size
-# output  - [num_tokens, num_heads, head_size]
-def triton_attention(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    o: torch.Tensor,
-    cu_seqlens_q: torch.Tensor,
-    cu_seqlens_k: torch.Tensor,
-    max_seqlens_q: int,
-    max_seqlens_k: int,
-    causal: bool = False,
-    sm_scale: float = 1.0,
-    bias: Optional[torch.Tensor] = None,
-    fp8_scales: Optional[tuple[float, ...]] = None,
-    input_scale: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    if fp8_scales is not None:
-        q_descale, k_descale, v_descale, p_scale = fp8_scales
-    else:
-        q_descale = k_descale = v_descale = p_scale = None
-
-    attn_metadata = MetaData(sm_scale=sm_scale,
-                             max_seqlens_q=max_seqlens_q,
-                             max_seqlens_k=max_seqlens_k,
-                             causal=causal,
-                             bias=bias,
-                             output_dtype=q.dtype,
-                             cu_seqlens_q=cu_seqlens_q,
-                             cu_seqlens_k=cu_seqlens_k,
-                             q_descale=q_descale,
-                             k_descale=k_descale,
-                             v_descale=v_descale,
-                             p_scale=p_scale,
-                             o_scale=input_scale)
-
-    if fp8_scales is not None:
-        q, k, v = check_and_maybe_quantize_qkv(q, k, v, fp8_scales)
-
-    return triton_attention_rocm(q, k, v, o, attn_metadata)
+triton_attention = _attention.apply
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index d544b4ab4..06ee8614d 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -98,6 +98,12 @@ def with_amdsmi_context(fn):
     return wrapper
 
 
+@cache
+def on_gfx1x() -> bool:
+    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+    return any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])
+
+
 @cache
 def on_mi250_mi300() -> bool:
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
-- 
GitLab


From c29034037d73e4f774796c661d3fc48038fc1067 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 30 May 2025 00:36:58 +0800
Subject: [PATCH 052/274] [Deprecation] Disallow pos-args other than `model`
 when initializing `LLM` (#18802)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/entrypoints/llm/test_init.py | 24 ------------------------
 vllm/entrypoints/llm.py            | 20 +++-----------------
 2 files changed, 3 insertions(+), 41 deletions(-)
 delete mode 100644 tests/entrypoints/llm/test_init.py

diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py
deleted file mode 100644
index 925bf56a9..000000000
--- a/tests/entrypoints/llm/test_init.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from vllm import LLM
-
-from ...utils import error_on_warning
-
-MODEL_NAME = "facebook/opt-125m"
-
-
-def test_pos_args_deprecated():
-    with error_on_warning(DeprecationWarning):
-        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
-
-    with error_on_warning(DeprecationWarning):
-        LLM(MODEL_NAME, tokenizer=MODEL_NAME)
-
-    with pytest.warns(DeprecationWarning, match="'tokenizer'"):
-        LLM(MODEL_NAME, MODEL_NAME)
-
-    with pytest.warns(DeprecationWarning,
-                      match="'tokenizer', 'tokenizer_mode'"):
-        LLM(MODEL_NAME, MODEL_NAME, "auto")
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f8eeae61f..e05189ef4 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -45,8 +45,7 @@ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                                get_cached_tokenizer)
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
-                        is_list_of)
+from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of
 
 if TYPE_CHECKING:
     from vllm.v1.metrics.reader import Metric
@@ -143,12 +142,6 @@ class LLM:
     DEPRECATE_LEGACY: ClassVar[bool] = True
     """A flag to toggle whether to deprecate the legacy generate/encode API."""
 
-    DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
-    """
-    A flag to toggle whether to deprecate positional arguments in
-    [LLM.__init__][].
-    """
-
     @classmethod
     @contextmanager
     def deprecate_legacy_api(cls):
@@ -158,16 +151,11 @@ class LLM:
 
         cls.DEPRECATE_LEGACY = False
 
-    @deprecate_args(
-        start_index=2,  # Ignore self and model
-        is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS,
-        additional_message=(
-            "All positional arguments other than `model` will be "
-            "replaced with keyword arguments in an upcoming version."),
-    )
     def __init__(
         self,
         model: str,
+        *,
+        task: TaskOption = "auto",
         tokenizer: Optional[str] = None,
         tokenizer_mode: TokenizerMode = "auto",
         skip_tokenizer_init: bool = False,
@@ -189,8 +177,6 @@ class LLM:
         hf_token: Optional[Union[bool, str]] = None,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
-        # After positional args are removed, move this right below `model`
-        task: TaskOption = "auto",
         override_pooler_config: Optional[PoolerConfig] = None,
         compilation_config: Optional[Union[int, dict[str, Any]]] = None,
         **kwargs,
-- 
GitLab


From d58f9c7f7a3bd29a6ab3f6a5a740299a4555cbc2 Mon Sep 17 00:00:00 2001
From: CYJiang <86391540+googs1025@users.noreply.github.com>
Date: Fri, 30 May 2025 01:26:07 +0800
Subject: [PATCH 053/274] [Misc] Remove duplicate init for self.vllm_config
 (#18896)

Signed-off-by: googs1025 <googs1025@gmail.com>
---
 vllm/v1/engine/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 740ba60fe..e6de31ab7 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -123,7 +123,6 @@ class EngineCore:
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
             self.batch_queue = queue.Queue(self.batch_queue_size)
-        self.vllm_config = vllm_config
 
     def _initialize_kv_caches(
             self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
-- 
GitLab


From 32ce3cf7c97bc309b5dbffaea5703a89969cfc1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Thu, 29 May 2025 19:54:16 +0200
Subject: [PATCH 054/274] [V1] Allocate kv_cache with stride order for V1
 (#18775)

Signed-off-by: nicklucche <nlucches@redhat.com>
---
 tests/v1/worker/test_gpu_model_runner.py | 71 +++++++++++++++++++-----
 vllm/v1/worker/gpu_model_runner.py       | 26 ++++++++-
 2 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index b8c3d8861..c38eb4866 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import random
+
 import pytest
 
+from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig, VllmConfig)
 from vllm.sampling_params import SamplingParams
@@ -13,27 +16,30 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
+BLOCK_SIZE = 16
+NUM_BLOCKS = 10
+
 
 def initialize_kv_cache(runner: GPUModelRunner):
     """
     Only perform necessary steps in GPUModelRunner.initialize_kv_cache()
     """
+    attn_spec = FullAttentionSpec(
+        block_size=BLOCK_SIZE,
+        num_kv_heads=runner.model_config.get_num_kv_heads(
+            runner.parallel_config),
+        head_size=runner.model_config.get_head_size(),
+        dtype=runner.kv_cache_dtype,
+        use_mla=False,
+    )
+    tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS
     kv_cache_config = KVCacheConfig(
-        num_blocks=10,
+        num_blocks=NUM_BLOCKS,
         tensors={
-            "layer.0": KVCacheTensor(size=1024),
+            "layer.0": KVCacheTensor(size=tensor_size),
         },
         kv_cache_groups=[
-            KVCacheGroupSpec(
-                layer_names=["layer.0"],
-                kv_cache_spec=FullAttentionSpec(
-                    block_size=16,
-                    num_kv_heads=runner.model_config.get_num_kv_heads(
-                        runner.parallel_config),
-                    head_size=runner.model_config.get_head_size(),
-                    dtype=runner.kv_cache_dtype,
-                    use_mla=False,
-                ))
+            KVCacheGroupSpec(layer_names=["layer.0"], kv_cache_spec=attn_spec)
         ])
     runner.kv_cache_config = kv_cache_config
     runner.input_batch = InputBatch(
@@ -65,7 +71,7 @@ def model_runner():
         seed=42,
     )
     cache_config = CacheConfig(
-        block_size=16,
+        block_size=BLOCK_SIZE,
         gpu_memory_utilization=0.9,
         swap_space=0,
         cache_dtype="auto",
@@ -77,6 +83,10 @@ def model_runner():
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
     )
+    num_heads = model_config.get_num_kv_heads(parallel_config)
+    head_size = model_config.get_head_size()
+    vllm_config.compilation_config.static_forward_context[
+        "layer.0"] = Attention(num_heads, head_size, 0.1)
 
     device = "cuda"
     runner = GPUModelRunner(vllm_config, device)
@@ -321,3 +331,38 @@ def test_update_states_request_unscheduled(model_runner):
 
     assert _is_req_added(model_runner, req_ids[1])
     assert not _is_req_scheduled(model_runner, req_ids[1])
+
+
+def test_kv_cache_stride_order(monkeypatch, model_runner):
+    # This test checks if GPUModelRunner initializes correctly when an attention
+    # backend enforces a non-default KV cache stride order.
+    n_heads = model_runner.model_config.get_num_kv_heads(
+        model_runner.parallel_config)
+    expected_kv_cache_shape = [
+        2, NUM_BLOCKS, BLOCK_SIZE, n_heads,
+        model_runner.model_config.get_head_size()
+    ]
+    # TODO mla test
+    default_stride = list(range(5))
+    # Permutation that gets you back to expected kv shape
+    rnd_stride = tuple(random.sample(default_stride, len(default_stride)))
+
+    def rnd_stride_order():
+        return rnd_stride
+
+    # Patch the attention backend class and re-trigger the KV cache creation.
+    for attn_backend in model_runner.attn_backends:
+        monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order",
+                            rnd_stride_order)
+
+    model_runner.attn_backends = []
+    model_runner.attn_metadata_builders = []
+    model_runner.initialize_kv_cache(model_runner.kv_cache_config)
+
+    # Shape is unchanged, but layout may differ
+    kv_cache_shape = model_runner.kv_caches[0].shape
+    assert list(kv_cache_shape) == expected_kv_cache_shape
+    if default_stride == rnd_stride:
+        assert all(kv.is_contiguous() for kv in model_runner.kv_caches)
+    else:
+        assert all(not kv.is_contiguous() for kv in model_runner.kv_caches)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d1195bcfb..60425a4e1 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2033,9 +2033,29 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                         num_blocks, kv_cache_spec.block_size,
                         kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
                     dtype = kv_cache_spec.dtype
-                    kv_caches[layer_name] = torch.zeros(kv_cache_shape,
-                                                        dtype=dtype,
-                                                        device=self.device)
+                    try:
+                        kv_cache_stride_order = self.attn_backends[
+                            i].get_kv_cache_stride_order()
+                        assert len(kv_cache_stride_order) == len(
+                            kv_cache_shape)
+                    except (AttributeError, NotImplementedError):
+                        kv_cache_stride_order = tuple(
+                            range(len(kv_cache_shape)))
+                    # The allocation respects the backend-defined stride order
+                    # to ensure the semantic remains consistent for each
+                    # backend. We first obtain the generic kv cache shape and
+                    # then permute it according to the stride order which could
+                    # result in a non-contiguous tensor.
+                    kv_cache_shape = tuple(kv_cache_shape[i]
+                                           for i in kv_cache_stride_order)
+                    # Maintain original KV shape view.
+                    inv_order = [
+                        kv_cache_stride_order.index(i)
+                        for i in range(len(kv_cache_stride_order))
+                    ]
+                    kv_caches[layer_name] = torch.zeros(
+                        kv_cache_shape, dtype=dtype,
+                        device=self.device).permute(*inv_order)
                 else:
                     # TODO: add new branches when introducing more types of
                     # KV cache specs.
-- 
GitLab


From d1d61f33511d46fda02944d8000a5ab22465c142 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Thu, 29 May 2025 11:04:18 -0700
Subject: [PATCH 055/274] [BugFix] Make DP work with connector-delayed new
 requests (#18559)

Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Will Eaton <weaton@redhat.com>
---
 tests/v1/engine/test_engine_core.py | 18 +++++------
 vllm/forward_context.py             |  7 ++--
 vllm/v1/engine/core.py              | 50 +++++++++++++----------------
 vllm/v1/engine/core_client.py       |  3 +-
 4 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index dcf494825..ae1d8a762 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -88,7 +88,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
         assert len(engine_core.scheduler.running) == 4
 
         # Loop through until they are all done.
-        while len(engine_core.step().outputs) > 0:
+        while len(engine_core.step()[0].outputs) > 0:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -163,11 +163,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
         req0.request_id = req1.request_id = "test"
         engine_core.add_request(req0)
 
-        while len(engine_core.step().outputs) > 0:
+        while len(engine_core.step()[0].outputs) > 0:
             pass
 
         engine_core.add_request(req1)
-        while len(engine_core.step().outputs) > 0:
+        while len(engine_core.step()[0].outputs) > 0:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -207,7 +207,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
             assert len(engine_core.scheduler.waiting) == 1
             assert len(engine_core.scheduler.running) == 0
             # Loop through until they are all done.
-            while len(engine_core.step().outputs) > 0:
+            while len(engine_core.step()[0].outputs) > 0:
                 pass
             assert len(engine_core.scheduler.waiting) == 0
             assert len(engine_core.scheduler.running) == 0
@@ -296,7 +296,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         engine_core.add_request(req1)
 
         # Schedule Batch 1: (10, req0)
-        assert engine_core.step_with_batch_queue() is None
+        assert engine_core.step_with_batch_queue()[0] is None
         assert engine_core.batch_queue.qsize() == 1
         scheduler_output = engine_core.batch_queue.queue[-1][1]
         assert scheduler_output.num_scheduled_tokens[0] == 10
@@ -305,7 +305,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
             req0.request_id].num_computed_tokens == 10
 
         # Schedule Batch 2: (2, req0), (8, req1)
-        assert engine_core.step_with_batch_queue() is None
+        assert engine_core.step_with_batch_queue()[0] is None
         assert engine_core.batch_queue.qsize() == 2
         scheduler_output = engine_core.batch_queue.queue[-1][1]
         assert scheduler_output.num_scheduled_tokens[0] == 2
@@ -327,7 +327,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         assert scheduler_output.num_scheduled_tokens[1] == 4
 
         # Batch queue is full. Finish Batch 2. Get first token of req0.
-        output = engine_core.step_with_batch_queue()
+        output = engine_core.step_with_batch_queue()[0]
         assert output is not None
         assert len(output.outputs) == 1
         assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
@@ -339,7 +339,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         assert scheduler_output.num_scheduled_tokens[0] == 1
 
         # Batch queue is full. Finish Batch 3. Get first token of req1.
-        output = engine_core.step_with_batch_queue()
+        output = engine_core.step_with_batch_queue()[0]
         assert output is not None
         assert len(output.outputs) == 1
         assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
@@ -358,7 +358,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
             engine_core.scheduler.requests[1].num_tokens + 1,
         ]
         while engine_core.scheduler.get_num_unfinished_requests() == 2:
-            output = engine_core.step_with_batch_queue()
+            output = engine_core.step_with_batch_queue()[0]
             if step % 2 == 0:
                 # Even steps consumes an output.
                 assert output is not None
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 592ca650a..0af16bbc0 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -101,7 +101,7 @@ def get_forward_context() -> ForwardContext:
 def set_forward_context(attn_metadata: Any,
                         vllm_config: VllmConfig,
                         virtual_engine: int = 0,
-                        num_tokens: int = 0):
+                        num_tokens: Optional[int] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
@@ -111,9 +111,10 @@ def set_forward_context(attn_metadata: Any,
     if need_to_track_batchsize:
         forward_start_time = time.perf_counter()
     dp_metadata: Optional[DPMetadata] = None
-    if vllm_config.parallel_config.data_parallel_size > 1:
+    if vllm_config.parallel_config.data_parallel_size > 1 and (
+            attn_metadata is not None or num_tokens is not None):
         dp_metadata = DPMetadata.make(vllm_config.parallel_config,
-                                      attn_metadata, num_tokens)
+                                      attn_metadata, num_tokens or 0)
 
     global _forward_context
     prev_context = _forward_context
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e6de31ab7..2a7680140 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -211,8 +211,12 @@ class EngineCore:
             # Re-raise exception
             raise err
 
-    def step(self) -> EngineCoreOutputs:
-        """Schedule, execute, and make output."""
+    def step(self) -> tuple[EngineCoreOutputs, bool]:
+        """Schedule, execute, and make output.
+
+        Returns tuple of outputs and a flag indicating whether the model
+        was executed.
+        """
 
         # Check for any requests remaining in the scheduler - unfinished,
         # or finished and not yet removed from the batch.
@@ -220,15 +224,17 @@ class EngineCore:
             return EngineCoreOutputs(
                 outputs=[],
                 scheduler_stats=self.scheduler.make_stats(),
-            )
+            ), False
         scheduler_output = self.scheduler.schedule()
         model_output = self.execute_model(scheduler_output)
         engine_core_outputs = self.scheduler.update_from_output(
             scheduler_output, model_output)  # type: ignore
 
-        return engine_core_outputs
+        return (engine_core_outputs,
+                scheduler_output.total_num_scheduled_tokens > 0)
 
-    def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]:
+    def step_with_batch_queue(
+            self) -> tuple[Optional[EngineCoreOutputs], bool]:
         """Schedule and execute batches with the batch queue.
         Note that if nothing to output in this step, None is returned.
 
@@ -273,7 +279,7 @@ class EngineCore:
             engine_core_outputs = self.scheduler.update_from_output(
                 scheduler_output, model_output)
 
-        return engine_core_outputs
+        return engine_core_outputs, scheduled_batch
 
     def shutdown(self):
         self.structured_output_manager.clear_backend()
@@ -537,15 +543,17 @@ class EngineCoreProc(EngineCore):
             req = self.input_queue.get_nowait()
             self._handle_client_request(*req)
 
-    def _process_engine_step(self):
+    def _process_engine_step(self) -> bool:
         """Called only when there are unfinished local requests."""
 
         # Step the engine core.
-        outputs = self.step_fn()
+        outputs, model_executed = self.step_fn()
         # Put EngineCoreOutputs into the output queue.
         if outputs is not None:
             self.output_queue.put_nowait(outputs)
 
+        return model_executed
+
     def _handle_client_request(self, request_type: EngineCoreRequestType,
                                request: Any) -> None:
         """Dispatch request from client."""
@@ -749,30 +757,16 @@ class DPEngineCoreProc(EngineCoreProc):
             # 1) Poll the input queue until there is work to do.
             self._process_input_queue()
 
+            # 2) Step the engine core.
+            executed = self._process_engine_step()
             local_unfinished_reqs = self.scheduler.has_unfinished_requests()
-
-            if local_unfinished_reqs:
-                # 2) Step the engine core.
-                self._process_engine_step()
-
-                # Check if we have now finished all requests.
-                local_unfinished_reqs = (
-                    self.scheduler.has_unfinished_requests())
-            else:
-                if self.scheduler.has_finished_requests():
-                    # There are no unfinished requests, but there are some
-                    # finished requests remaining to be removed from the
-                    # batch state. This engine step won't perform a forward
-                    # pass but will flush the finished requests to ensure
-                    # up-to-date state is returned in the engine outputs.
-                    self._process_engine_step()
-
-                if not self.engines_running:
+            if not executed:
+                if not local_unfinished_reqs and not self.engines_running:
                     # All engines are idle.
                     continue
 
-                # There must be unfinished requests in DP peers, run a
-                # dummy forward pass.
+                # We are in a running state and so must execute a dummy pass
+                # if the model didn't execute any ready requests.
                 self.execute_dummy_batch()
 
             # 3) All-reduce operation to determine global unfinished reqs.
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 0d52bc9a6..9f8a9b692 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -206,7 +206,8 @@ class InprocClient(EngineCoreClient):
         self.engine_core = EngineCore(*args, **kwargs)
 
     def get_output(self) -> EngineCoreOutputs:
-        return self.engine_core.step()
+        outputs, _ = self.engine_core.step()
+        return outputs
 
     def add_request(self, request: EngineCoreRequest) -> None:
         self.engine_core.add_request(request)
-- 
GitLab


From 64eaf5fe0589a9e9082ce1de6a2e9d22ccb84b6a Mon Sep 17 00:00:00 2001
From: Will Eaton <wseaton@users.noreply.github.com>
Date: Thu, 29 May 2025 14:08:40 -0400
Subject: [PATCH 056/274] [P/D] NixlConnector DP fixes (#18903)

Signed-off-by: Will Eaton <weaton@redhat.com>
---
 vllm/distributed/kv_transfer/kv_connector/factory.py |  3 ++-
 .../kv_transfer/kv_connector/v1/nixl_connector.py    | 12 +++++++-----
 vllm/v1/engine/core.py                               |  9 +++++++++
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 06b3983ed..dce0b545c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -70,7 +70,8 @@ class KVConnectorFactory:
             connector_module = importlib.import_module(connector_module_path)
             connector_cls = getattr(connector_module, connector_name)
         assert issubclass(connector_cls, KVConnectorBase_V1)
-        logger.info("Creating v1 connector with name: %s", connector_name)
+        logger.info("Creating v1 connector with name: %s and engine_id: %s",
+                    connector_name, kv_transfer_config.engine_id)
         # NOTE(Kuntai): v1 connector is explicitly separated into two roles.
         # Scheduler connector:
         # - Co-locate with scheduler process
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 6303d77ad..f02434aeb 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
-    get_tp_group)
+    get_tp_group, get_world_group)
 from vllm.logger import init_logger
 from vllm.utils import make_zmq_path, make_zmq_socket, round_down
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -334,6 +334,7 @@ class NixlConnectorWorker:
         self.engine_id = engine_id
         self.rank = get_tensor_model_parallel_rank()
         self.world_size = get_tensor_model_parallel_world_size()
+        self.world_rank = get_world_group().rank_in_group
         self.tp_group = get_tp_group()
 
         # KV Caches and nixl tracking data.
@@ -382,7 +383,8 @@ class NixlConnectorWorker:
 
     @staticmethod
     def _nixl_handshake_listener(metadata: NixlAgentMetadata,
-                                 ready_event: threading.Event, rank: int):
+                                 ready_event: threading.Event,
+                                 world_rank: int):
         """Background thread for getting new NIXL handshakes."""
         # NOTE(rob): this is a simple implementation. We will move
         # to a better approach like an ETCD server in the future.
@@ -403,7 +405,7 @@ class NixlConnectorWorker:
         # NOTE(rob): we need each rank to have a unique port. This
         # hack to keeps us moving. We will switch when moving to etcd
         # or where we have a single ZMQ socket in the scheduler.
-        port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + rank
+        port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + world_rank
         path = make_zmq_path("tcp", host, port)
         logger.debug("Starting listening on path: %s", path)
         with zmq_ctx(zmq.ROUTER, path) as sock:
@@ -422,7 +424,7 @@ class NixlConnectorWorker:
         # NOTE(rob): we need each rank to have a unique port. This is
         # a hack to keep us moving. We will switch when moving to etcd
         # or where we have a single ZMQ socket in the scheduler.
-        path = make_zmq_path("tcp", host, port + self.rank)
+        path = make_zmq_path("tcp", host, port + self.world_rank)
         logger.debug("Querying metadata on path: %s", path)
         with zmq_ctx(zmq.REQ, path) as sock:
             # Send query for the request.
@@ -529,7 +531,7 @@ class NixlConnectorWorker:
         ready_event = threading.Event()
         self._nixl_handshake_listener_t = threading.Thread(
             target=self._nixl_handshake_listener,
-            args=(metadata, ready_event, self.rank),
+            args=(metadata, ready_event, self.world_rank),
             daemon=True,
             name="nixl_handshake_listener")
         self._nixl_handshake_listener_t.start()
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 2a7680140..ed71d9b67 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -707,6 +707,15 @@ class DPEngineCoreProc(EngineCoreProc):
         assert dp_size > 1
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
+        if vllm_config.kv_transfer_config is not None:
+            # modify the engine_id and append the local_dp_rank to it to ensure
+            # that the kv_transfer_config is unique for each DP rank.
+            vllm_config.kv_transfer_config.engine_id = (
+                f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
+            )
+            logger.debug("Setting kv_transfer_config.engine_id to %s",
+                         vllm_config.kv_transfer_config.engine_id)
+
         from vllm.platforms import current_platform
         device_control_env_var = current_platform.device_control_env_var
         world_size = vllm_config.parallel_config.world_size
-- 
GitLab


From a521ef06e5eb18a34d665282fa38c4768a855bb8 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Thu, 29 May 2025 18:41:58 -0400
Subject: [PATCH 057/274] Use standalone_compile by default in torch >= 2.8.0
 (#18846)

Signed-off-by: rzou <zou3519@gmail.com>
---
 vllm/compilation/backends.py           |  5 +++--
 vllm/compilation/compiler_interface.py |  2 +-
 vllm/envs.py                           | 14 ++++++++------
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 0358c9d0d..b724479a9 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -16,7 +16,7 @@ import vllm.envs as envs
 from vllm.config import CompilationConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import resolve_obj_by_qualname
+from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
 
 from .compiler_interface import (CompilerInterface, EagerAdaptor,
                                  InductorAdaptor, InductorStandaloneAdaptor)
@@ -29,7 +29,8 @@ logger = init_logger(__name__)
 
 def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
     if compilation_config.use_inductor:
-        if envs.VLLM_TEST_STANDALONE_COMPILE:
+        if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer(
+                "2.8.0"):
             logger.info("Using InductorStandaloneAdaptor")
             return InductorStandaloneAdaptor()
         else:
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 7e9186f86..8fa8ce279 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -155,7 +155,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
     This is not on by default yet, but we plan to turn it on by default for
     PyTorch 2.8.
 
-    Use VLLM_TEST_STANDALONE_COMPILE to toggle this on or off.
+    Use VLLM_USE_STANDALONE_COMPILE to toggle this on or off.
     """
     name = "inductor_standalone"
 
diff --git a/vllm/envs.py b/vllm/envs.py
index bd9104afa..785fe7309 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -143,10 +143,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
 
 def get_vllm_port() -> Optional[int]:
     """Get the port from VLLM_PORT environment variable.
-    
+
     Returns:
         The port number as an integer if VLLM_PORT is set, None otherwise.
-        
+
     Raises:
         ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue.
     """
@@ -308,9 +308,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: bool(
         os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
 
-    # Internal flag to enable/disable Inductor standalone compile
-    "VLLM_TEST_STANDALONE_COMPILE":
-    lambda: os.environ.get("VLLM_TEST_STANDALONE_COMPILE", "0") != "0",
+    # Feature flag to enable/disable Inductor standalone compile.
+    # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
+    # enabled by default.
+    "VLLM_USE_STANDALONE_COMPILE":
+    lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1",
 
     # local rank of the process in the distributed setting, used to determine
     # the GPU device id
@@ -892,7 +894,7 @@ def compute_hash() -> str:
         "VLLM_USE_TRITON_AWQ",
         "VLLM_DP_RANK",
         "VLLM_DP_SIZE",
-        "VLLM_TEST_STANDALONE_COMPILE",
+        "VLLM_USE_STANDALONE_COMPILE",
     ]
     for key in environment_variables_to_hash:
         if key in environment_variables:
-- 
GitLab


From a1cc9f33a32eef4550daccdc76aefc1baf7bc35d Mon Sep 17 00:00:00 2001
From: Chengji Yao <chengjiyao@google.com>
Date: Thu, 29 May 2025 16:00:11 -0700
Subject: [PATCH 058/274] [TPU] remove transpose ops in moe kernel (#18923)

Signed-off-by: Chengji Yao <chengjiyao@google.com>
---
 requirements/tpu.txt                               | 10 +++++-----
 tests/tpu/test_moe_pallas.py                       |  2 +-
 vllm/model_executor/layers/fused_moe/moe_pallas.py |  9 ++-------
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index 3b204a8f9..edc8b2a45 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250518
-torchvision==0.22.0.dev20250518
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.8.0.dev20250529
+torchvision==0.22.0.dev20250529
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
 
diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py
index 13fc8bc8f..19df22f78 100644
--- a/tests/tpu/test_moe_pallas.py
+++ b/tests/tpu/test_moe_pallas.py
@@ -26,7 +26,7 @@ TOP_KS = [2, 6]
 # The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
 @pytest.mark.parametrize("m", [8, 16, 64, 2048])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("k", [128, 512, 1024])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("ep_size", EP_SIZE)
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index 539459992..9d8bd62c6 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -67,15 +67,10 @@ def fused_moe(
     token_indices = token_indices[topk_argsort_indices]
     group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1)
 
-    # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout
-    # from HF Transformers.
-    w1 = w1.transpose(1, 2)
-    w2 = w2.transpose(1, 2)
-
     x = hidden_states[token_indices]
-    x = torch.ops.xla.gmm(x, w1, group_sizes)
+    x = torch.ops.xla.gmm(x, w1, group_sizes, transpose_rhs=True)
     x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:]
-    x = torch.ops.xla.gmm(x, w2, group_sizes)
+    x = torch.ops.xla.gmm(x, w2, group_sizes, transpose_rhs=True)
     x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size)
 
     x = x * topk_weights.unsqueeze(dim=-1)
-- 
GitLab


From d54af615d5cf63e111e564b3a2c25405fc16b4fd Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Thu, 29 May 2025 22:13:17 -0400
Subject: [PATCH 059/274] [Bugfix] Fix PP default fallback behavior for V1
 (#18915)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 vllm/engine/arg_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 2a1a34211..13d8a280e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1380,7 +1380,8 @@ class EngineArgs:
 
         if (self.pipeline_parallel_size > 1
                 and self.distributed_executor_backend
-                not in ("ray", "mp", "external_launcher")):
+                not in (ParallelConfig.distributed_executor_backend, "ray",
+                        "mp", "external_launcher")):
             name = "Pipeline Parallelism without Ray distributed executor " \
                     "or multiprocessing executor or external launcher"
             _raise_or_fallback(feature_name=name, recommend_to_remove=False)
-- 
GitLab


From 1aa2f81b43b286ec9fa66439c151d18f20a662ec Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 30 May 2025 10:17:01 +0800
Subject: [PATCH 060/274] [Misc] Update type annotation for rotary embedding
 `base` (#18914)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 benchmarks/kernels/benchmark_rope.py          |  2 +-
 tests/kernels/core/test_pos_encoding.py       |  6 ++--
 .../model_executor/layers/rotary_embedding.py | 34 +++++++++----------
 vllm/model_executor/models/minimax_text_01.py |  7 ++--
 4 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 110d36db1..944024ca3 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -22,7 +22,7 @@ def benchmark_rope_kernels_multi_lora(
     seed: int,
     device: str,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py
index f327deb0e..8cb56314c 100644
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -70,7 +70,7 @@ def test_rotary_embedding(
     device: str,
     use_key: bool,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     if rotary_dim is None:
         rotary_dim = head_size
@@ -135,7 +135,7 @@ def test_batched_rotary_embedding(
     device: str,
     use_key: bool,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
@@ -203,7 +203,7 @@ def test_batched_rotary_embedding_multi_lora(
     device: str,
     use_key: bool,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 70463ecd9..afc059719 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -96,7 +96,7 @@ class RotaryEmbedding(CustomOp):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
     ) -> None:
@@ -113,7 +113,7 @@ class RotaryEmbedding(CustomOp):
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
         # NOTE(woosuk): To exactly match the HF implementation, we need to
         # use CPU to compute the cache and then move it to GPU. However, we
@@ -404,7 +404,7 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factors: Union[list[float], float],
         dtype: torch.dtype,
@@ -464,7 +464,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding):
                  head_size: int,
                  rotary_dim: int,
                  max_position_embeddings: int,
-                 base: int,
+                 base: float,
                  is_neox_style: bool,
                  scaling_factor: float,
                  dtype: torch.dtype,
@@ -474,7 +474,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding):
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
         inv_freq = super()._compute_inv_freq(base)
 
@@ -501,7 +501,7 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factor: float,
         dtype: torch.dtype,
@@ -582,7 +582,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factor: float,
         dtype: torch.dtype,
@@ -644,7 +644,7 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         rotary_dim: int,
         max_position_embeddings: int,
         original_max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         short_factor: list[float],
@@ -769,7 +769,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factor: float,
         dtype: torch.dtype,
@@ -877,7 +877,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         scaling_factor: float,
@@ -892,7 +892,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         inv_freqs = super()._compute_inv_freq(base)
         low_freq_wavelen = self.orig_max_position / self.low_freq_factor
         high_freq_wavelen = self.orig_max_position / self.high_freq_factor
@@ -923,14 +923,14 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
     ):
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         inv_freqs = super()._compute_inv_freq(base)
         inv_freqs = inv_freqs[:(self.rotary_dim // 2)]
         return inv_freqs
@@ -989,7 +989,7 @@ class MRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         mrope_section: Optional[list[int]] = None,
@@ -1529,7 +1529,7 @@ class DualChunkRotaryEmbedding(CustomOp):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         chunk_size: int,
@@ -1558,7 +1558,7 @@ class DualChunkRotaryEmbedding(CustomOp):
                              q_inter_cache,
                              persistent=False)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
         # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
         # However, we use `torch.arange(..., dtype=torch.float)` instead to
@@ -1705,7 +1705,7 @@ def get_rope(
     head_size: int,
     rotary_dim: int,
     max_position: int,
-    base: int,
+    base: float,
     is_neox_style: bool = True,
     rope_scaling: Optional[dict[str, Any]] = None,
     dtype: Optional[torch.dtype] = None,
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 36bab9ee1..ac0fe7b10 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -141,7 +141,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp):
         head_size: int,
         rotary_dim: int,
         max_position: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         cache_dtype: torch.dtype,
     ) -> None:
@@ -155,10 +155,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp):
         cache = self._compute_cos_sin_cache().to(cache_dtype)
         self.register_buffer("cos_sin_cache", cache, persistent=False)
 
-    def _compute_inv_freq(
-        self,
-        base: Union[int, float],
-    ) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
         inv_freq = 1.0 / (base**(torch.arange(
             0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
-- 
GitLab


From 3132290a14a66dc73c9f15ec9cd9f8909c978e11 Mon Sep 17 00:00:00 2001
From: Carol Zheng <cazheng@google.com>
Date: Fri, 30 May 2025 02:24:19 +0000
Subject: [PATCH 061/274] [TPU][CI/CD] Clean up docker for TPU tests. (#18926)

Signed-off-by: Carol Zheng <cazheng@google.com>
---
 .../scripts/hardware_ci/run-tpu-v1-test.sh    | 39 +++++++++++++++++--
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 5dd53420d..610243145 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -2,15 +2,46 @@
 
 set -xu
 
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
+remove_docker_container() { 
+    docker rm -f tpu-test || true; 
+    docker rm -f vllm-tpu || true;
+}
+
 trap remove_docker_container EXIT
+
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+cleanup_docker
+
 # For HF_TOKEN.
 source /etc/environment
 
-- 
GitLab


From 3de3eadf5b1c271ccd7140526ffb3f850d6b0189 Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Fri, 30 May 2025 10:24:47 +0800
Subject: [PATCH 062/274] improve the robustness of parsing vlms config in
 AutoRound (#18894)

Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
---
 vllm/model_executor/layers/quantization/auto_round.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index 2d9f5e52b..eb8ffa378 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -116,8 +116,9 @@ class AutoRoundConfig(QuantizationConfig):
 
         quantized = True
         if self.block_name_to_quantize:
-            quantized = any(name in layer_name
-                            for name in self.block_name_to_quantize)
+            quantized = any(
+                layer_name.startswith(name)
+                for name in self.block_name_to_quantize)
         elif isinstance(layer, ParallelLMHead):
             quantized = False
 
-- 
GitLab


From 77164dad5e7973163d1563f0854644c2864700be Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Fri, 30 May 2025 12:44:43 +0800
Subject: [PATCH 063/274] [Bugfix] Consistent ascii handling in tool parsers
 (#18883)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/entrypoints/openai/serving_chat.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 6a0e3b14d..ea8e187dc 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -988,7 +988,8 @@ class OpenAIServingChat(OpenAIServing):
                     tool_calls=[
                         tool_call_class(function=FunctionCall(
                             name=tool_call.name,
-                            arguments=json.dumps(tool_call.parameters)))
+                            arguments=json.dumps(tool_call.parameters,
+                                                 ensure_ascii=False)))
                         for tool_call in tool_calls
                     ])
 
-- 
GitLab


From 3987e2ae963963b9edb132935deabd16dd5a7468 Mon Sep 17 00:00:00 2001
From: iLeGend <youzhi.jin@intel.com>
Date: Fri, 30 May 2025 12:50:10 +0800
Subject: [PATCH 064/274] [Model] Use AutoWeightsLoader for mamba2 (#18918)

Signed-off-by: iLeGend <824040212@qq.com>
---
 vllm/model_executor/models/mamba2.py | 43 ++++++++++++++++------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index 858a1633b..65c6467bc 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -32,7 +32,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
 
-from .utils import (is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -167,6 +167,27 @@ class Mamba2Model(nn.Module):
 
         return hidden_states
 
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if is_pp_missing_parameter(name, self):
+                continue
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree,
                         SupportsV0Only):
@@ -282,21 +303,5 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree,
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "A_log" in name:
-                name = name.replace("A_log", "A")
-
-            # Skip loading extra bias for GPTQ models.
-            if name.endswith(".bias") and name not in params_dict:
-                continue
-            if is_pp_missing_parameter(name, self):
-                continue
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
-- 
GitLab


From 5acf828d9932fb109f18b9790d0bca011a9ed868 Mon Sep 17 00:00:00 2001
From: H <linhaibin.eric@gmail.com>
Date: Thu, 29 May 2025 22:20:48 -0700
Subject: [PATCH 065/274] [docs] fix: fix markdown syntax (#18927)

---
 docs/design/arch_overview.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md
index 75d3e1b7c..14720a392 100644
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -48,8 +48,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-More API details can be found in the [Offline Inference]
-(#offline-inference-api) section of the API docs.
+More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
 
 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
 
-- 
GitLab


From 77b6e74fe2b6259b2ad4dbca55eca50017a68926 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Fri, 30 May 2025 13:33:17 +0800
Subject: [PATCH 066/274] [ROCm] Remove unnecessary assertion of max_model_len
 in ROCM_AITER_MLA attention backend. (#18938)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 vllm/attention/backends/rocm_aiter_mla.py        | 2 --
 vllm/v1/attention/backends/mla/rocm_aiter_mla.py | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py
index b04822002..c974f2a15 100644
--- a/vllm/attention/backends/rocm_aiter_mla.py
+++ b/vllm/attention/backends/rocm_aiter_mla.py
@@ -132,8 +132,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
 
     def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         super().__init__(input_builder)
-        assert self.runner.model_config.max_model_len == 32768,\
-                "AITER MLA requires max model len to be set to 32768"
         assert self.block_size == 1, "AITER MLA requires only block size 1."
 
     def prepare(self):
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 31980e94a..d1e823bbe 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -66,9 +66,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
     def __init__(self, runner, kv_cache_spec: AttentionSpec,
                  block_table: BlockTable):
         super().__init__(runner, kv_cache_spec, block_table)
-        max_model_len = self.runner.model_config.max_model_len
-        assert max_model_len == 32768,\
-            "AITER MLA requires max_model_len=32768"
         assert self.kv_cache_spec.block_size == 1, "AITER MLA" \
             "only supports block size 1."
 
-- 
GitLab


From 4d0a1541bed0a89f71c55b0714e18422688bec53 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 30 May 2025 01:37:36 -0400
Subject: [PATCH 067/274] [Bugfix] Remove NVFP4 scales assertions to fix
 load_format=dummy (#18861)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 vllm/model_executor/layers/quantization/modelopt.py       | 8 +++++---
 .../layers/quantization/utils/marlin_utils_fp4.py         | 7 ++++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 1c5680f95..2abe16a08 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -585,9 +585,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         # GEMM 1
-        assert torch.allclose(
-            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]), (
-                "w1_weight_scale_2 must match w3_weight_scale_2")
+        if not torch.allclose(layer.w13_weight_scale_2[:, 0],
+                              layer.w13_weight_scale_2[:, 1]):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected.")
 
         w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
         layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2,
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index 15177af58..13dcdc00a 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -22,7 +22,12 @@ def is_fp4_marlin_supported():
 
 
 def fp4_marlin_process_scales(marlin_scales):
-    assert (marlin_scales >= 0).all()
+    if not (marlin_scales >= 0).all():
+        logger.warning_once(
+            "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
+            "negative scales. Accuracy will likely be degraded. This is "
+            "because it changes the scales from FP8-S1E4M3 to a special "
+            "FP8-S0E5M3 format to speedup the dequantization.")
 
     # convert to half first, we would convert to fp8 later
     marlin_scales = marlin_scales.to(torch.half)
-- 
GitLab


From 4f4a6b844a6347a539b9dd87c39f557d91981988 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 30 May 2025 14:53:37 +0800
Subject: [PATCH 068/274] [Deprecation] Remove mean pooling default for
 `Qwen2EmbeddingModel` (#18913)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/models/supported_models.md        |  2 +-
 vllm/model_executor/models/qwen2.py    | 77 +-------------------------
 vllm/model_executor/models/registry.py |  2 +-
 3 files changed, 5 insertions(+), 76 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 0202ba5a6..6b0ceaf21 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -401,7 +401,7 @@ Specified using `--task embed`.
 
 !!! note
     `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
-    You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
+    You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
 
 !!! note
     For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 0d0d98c59..a664864ff 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -34,32 +34,27 @@ from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    extract_layer_index, is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
+                    is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
-logger = init_logger(__name__)
-
 
 class Qwen2MLP(nn.Module):
 
@@ -499,69 +494,3 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                            if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
-
-
-class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
-        pooler_config = vllm_config.model_config.pooler_config
-
-        self.config = config
-        self.lora_config = lora_config
-
-        self.quant_config = quant_config
-        self.model = Qwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-
-        # TODO: Replace this model class with as_embedding_model(
-        # Qwen2ForCausalLM) after changing the default pooling method
-        if pooler_config.pooling_type is None:
-            logger.warning(
-                "This embedding model will default to last-token pooling in "
-                "an upcoming version. To avoid breaking changes, you should "
-                "pass `--override-pooler-config '{\"pooling_type\": \"MEAN\"}'`"
-                " explicitly.")
-
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.MEAN,
-            normalize=True,
-            softmax=False)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> torch.Tensor:
-        return self.model(input_ids, positions, intermediate_tensors)
-
-    def pooler(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> Optional[PoolerOutput]:
-        return self._pooler(hidden_states, pooling_metadata)
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        weights = self.hf_to_vllm_mapper.apply(weights)
-        weights = ((name, data) for name, data in weights
-                   if not name.startswith("lm_head."))
-        self.model.load_weights(weights)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 97ea12de6..5a6a12fcc 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -142,7 +142,7 @@ _EMBEDDING_MODELS = {
     "ModernBertModel": ("modernbert", "ModernBertModel"),
     "NomicBertModel": ("bert_with_rope", "NomicBertModel"),
     "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
-    "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"),
+    "Qwen2Model": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
     "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"),
-- 
GitLab


From 6acb7a62855a037a9cb50344e692ca23ed8782ea Mon Sep 17 00:00:00 2001
From: Rabi Mishra <ramishra@redhat.com>
Date: Fri, 30 May 2025 13:28:04 +0530
Subject: [PATCH 069/274] [Misc]Fix benchmarks/README.md for speculative
 decoding (#18897)

Signed-off-by: rabi <ramishra@redhat.com>
---
 benchmarks/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ecab570bb..cbf2f281b 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -146,9 +146,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
 
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --ngram_prompt_lookup_min 2 \
-    --ngram-prompt-lookup-max 5 \
-    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
 ```
 
 ``` bash
@@ -273,9 +273,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
     --output-len=100 \
     --num-prompts=2048 \
     --async-engine \
-    --ngram_prompt_lookup_min=2 \
-    --ngram-prompt-lookup-max=5 \
-    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
 ```
 
 ```
-- 
GitLab


From 8f8900cee9121572d27463b7d8223fa6ef76e0ef Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Fri, 30 May 2025 15:58:44 +0800
Subject: [PATCH 070/274] [doc] add mkdocs doc (#18930)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/contributing/README.md | 53 ++++++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 2517436af..72c4909b1 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -29,20 +29,67 @@ See <gh-file:LICENSE>.
 Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
 Check out the [building from source][build-from-source] documentation for details.
 
-### Building the docs
+### Building the docs with MkDocs
 
-Install the dependencies:
+#### Introduction to MkDocs
+
+[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
+
+#### Install MkDocs and Plugins
+
+Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
 
 ```bash
 pip install -r requirements/docs.txt
 ```
 
-Start the autoreloading MkDocs server:
+> **Note:** Ensure that your Python version is compatible with the plugins (e.g., mkdocs-awesome-nav requires Python 3.10+)
+
+#### Verify Installation
+
+Confirm that MkDocs is correctly installed::
+
+```bash
+mkdocs --version
+```
+
+Example output:
+
+```console
+mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.9/site-packages/mkdocs (Python 3.9)
+```
+
+#### Clone the `vLLM` repository
+
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+```
+
+#### Start the Development Server
+
+MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
 
 ```bash
 mkdocs serve
 ```
 
+Example output:
+
+```console
+INFO    -  Documentation built in 106.83 seconds
+INFO    -  [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
+INFO    -  [22:02:02] Serving on http://127.0.0.1:8000/
+```
+
+#### View in Your Browser
+
+Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
+
+#### Learn More
+
+For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
+
 ## Testing
 
 ```bash
-- 
GitLab


From c3bb9f23315f8cdc5007349717fd58d16a833ff6 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Fri, 30 May 2025 10:12:59 +0100
Subject: [PATCH 071/274] [Model] Use in-place adds in SigLIP (#18922)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/model_executor/models/siglip.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 3b5334afa..4803da295 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -130,11 +130,10 @@ class SiglipVisionEmbeddings(nn.Module):
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
         if interpolate_pos_encoding:
-            embeddings = embeddings + self.interpolate_pos_encoding(
+            embeddings += self.interpolate_pos_encoding(
                 embeddings, height, width)
         else:
-            embeddings = embeddings + self.position_embedding(
-                self.position_ids)
+            embeddings += self.position_embedding(self.position_ids)
         return embeddings
 
 
@@ -271,12 +270,12 @@ class SiglipEncoderLayer(nn.Module):
 
         hidden_states = self.layer_norm1(hidden_states)
         hidden_states, _ = self.self_attn(hidden_states=hidden_states)
-        hidden_states = residual + hidden_states
+        hidden_states += residual
 
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
         hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
+        hidden_states += residual
 
         return hidden_states, None
 
@@ -354,7 +353,8 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
 
         residual = hidden_state
         hidden_state = self.layernorm(hidden_state)
-        hidden_state = residual + self.mlp(hidden_state)
+        hidden_state = self.mlp(hidden_state)
+        hidden_state += residual
 
         return hidden_state[:, 0]
 
-- 
GitLab


From 5f1d0c8118b099ca0a08435f5f0312aa50f21142 Mon Sep 17 00:00:00 2001
From: Rabi Mishra <ramishra@redhat.com>
Date: Fri, 30 May 2025 14:43:47 +0530
Subject: [PATCH 072/274] [Bugfix][Failing Test] Fix test_vllm_port.py (#18618)

Signed-off-by: rabi <ramishra@redhat.com>
---
 .buildkite/test-pipeline.yaml |  3 ++-
 vllm/envs.py                  | 18 +++++++-----------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4e7bea25e..46785a8b3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -199,8 +199,9 @@ steps:
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
+  - tests/test_vllm_port
   commands:
-  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 785fe7309..dc52bbd8e 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -159,17 +159,13 @@ def get_vllm_port() -> Optional[int]:
         return int(port)
     except ValueError as err:
         from urllib.parse import urlparse
-        try:
-            parsed = urlparse(port)
-            if parsed.scheme:
-                raise ValueError(
-                    f"VLLM_PORT '{port}' appears to be a URI. "
-                    "This may be caused by a Kubernetes service discovery issue"
-                    "check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html"
-                )
-        except Exception:
-            pass
-
+        parsed = urlparse(port)
+        if parsed.scheme:
+            raise ValueError(
+                f"VLLM_PORT '{port}' appears to be a URI. "
+                "This may be caused by a Kubernetes service discovery issue,"
+                "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
+            ) from None
         raise ValueError(
             f"VLLM_PORT '{port}' must be a valid integer") from err
 
-- 
GitLab


From 4577fc9abb064d74b2082ffc5005cbb82ca91766 Mon Sep 17 00:00:00 2001
From: Always-Naive <97138029+Always-Naive@users.noreply.github.com>
Date: Fri, 30 May 2025 17:21:35 +0800
Subject: [PATCH 073/274] [Misc]Fix typo (#18947)

---
 vllm/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index c1213d463..25e34446a 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -107,7 +107,7 @@ STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = (
     "currently not supported for encoder/decoder "
     "models.")
 
-STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is currently not currently "
+STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is not currently "
                              "supported with encoder/decoder "
                              "models.")
 
-- 
GitLab


From fba02e3bd1775f28cb18390a9485460a9aabdeec Mon Sep 17 00:00:00 2001
From: Carol Zheng <cazheng@google.com>
Date: Fri, 30 May 2025 10:04:03 +0000
Subject: [PATCH 074/274] [Bugfix][TPU] Fix tpu model runner testcase failure
 (#18810)

Signed-off-by: Carol Zheng <cazheng@google.com>
---
 tests/v1/tpu/worker/test_tpu_model_runner.py | 31 ++++++++++++++---
 vllm/v1/worker/tpu_model_runner.py           | 35 ++++++++++++++------
 2 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 319b38b4c..348f12887 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -81,7 +81,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                 mm_hashes=[],
                 mm_positions=[],
                 sampling_params=SamplingParams(),
-                block_ids=[0],
+                block_ids=[[0]],  # block_ids should be list[list[int]]
                 num_computed_tokens=0,
                 lora_request=None,
             ))
@@ -112,14 +112,35 @@ def _is_req_added(model_runner, req_id: str) -> bool:
 
 
 def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
+    """Check if the request state block IDs match the block table.
+    
+    This function handles both legacy BlockTable and new MultiGroupBlockTable
+    structures for backward compatibility.
+    """
+
     req_index = model_runner.input_batch.req_id_to_index[req_id]
-    block_table = model_runner.input_batch.block_table
+    multi_group_block_table = model_runner.input_batch.block_table
     req_state = model_runner.requests[req_id]
-    if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids):
+
+    # Access the first block table from MultiGroupBlockTable
+    # This is safe since we currently only use single KV cache groups
+    block_table = multi_group_block_table[0]
+
+    # req_state.block_ids is now list[list[int]] for MultiGroupBlockTable
+    # Extract the first group's block IDs
+    if isinstance(req_state.block_ids[0], list):
+        # New format: list[list[int]] - extract first group
+        req_block_ids = req_state.block_ids[0]
+    else:
+        # Legacy format: list[int] - use directly
+        req_block_ids = req_state.block_ids
+
+    if block_table.num_blocks_per_row[req_index] != len(req_block_ids):
         return False
+
     num_blocks = block_table.num_blocks_per_row[req_index]
-    return (block_table.block_table_np[req_index, :num_blocks] ==
-            req_state.block_ids).all()
+    block_table_values = block_table.block_table_np[req_index, :num_blocks]
+    return (block_table_values == req_block_ids).all()
 
 
 def test_update_states_new_request(model_runner):
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 669908cb5..c57ac3138 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -175,11 +175,21 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.kv_caches: list[torch.Tensor] = []
         # req_id -> (input_id -> encoder_output)
         self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
-        # self.input_batch: InputBatch  # Persistent batch.
 
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
 
+        # Initialize input batch early to avoid AttributeError in _update_states
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_size=self.block_size,
+        )
+
         # Cached torch/numpy tensor
         # The pytorch tensor and numpy array share the same buffer.
         # Sometimes the numpy op is faster so we create both.
@@ -1286,16 +1296,19 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 "Hybrid models with more than one KV cache type are not "
                 "supported yet.")
 
-        self.input_batch = InputBatch(
-            max_num_reqs=self.max_num_reqs,
-            max_model_len=self.max_model_len,
-            max_num_batched_tokens=self.max_num_tokens,
-            device=self.device,
-            pin_memory=self.pin_memory,
-            vocab_size=self.model_config.get_vocab_size(),
-            block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.
-            block_size,
-        )
+        if kv_cache_config.kv_cache_groups[
+                0].kv_cache_spec.block_size != self.block_size:
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.max_model_len,
+                max_num_batched_tokens=self.max_num_tokens,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                vocab_size=self.model_config.get_vocab_size(),
+                block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.
+                block_size,
+            )
+        # Verify dtype compatibility between block_table_cpu and input_batch
         assert self.block_table_cpu.dtype == self.input_batch.block_table[
             0].get_cpu_tensor().dtype
 
-- 
GitLab


From 43ff405b9059d2991604fb35e23bf212f8f390f3 Mon Sep 17 00:00:00 2001
From: Daniele <36171005+dtrifiro@users.noreply.github.com>
Date: Fri, 30 May 2025 13:02:50 +0200
Subject: [PATCH 075/274] [CI/Build] remove regex from build dependencies
 (#18945)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 pyproject.toml                | 1 -
 setup.py                      | 2 +-
 tools/enforce_regex_import.py | 3 +++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5286724b5..10f5dbeae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,6 @@ requires = [
     "setuptools-scm>=8.0",
     "torch == 2.7.0",
     "wheel",
-    "regex",
     "jinja2",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
index b822a4ec3..c190864dd 100644
--- a/setup.py
+++ b/setup.py
@@ -5,12 +5,12 @@ import importlib.util
 import json
 import logging
 import os
+import re
 import subprocess
 import sys
 from pathlib import Path
 from shutil import which
 
-import regex as re
 import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py
index b55c4a94e..6c201dd25 100644
--- a/tools/enforce_regex_import.py
+++ b/tools/enforce_regex_import.py
@@ -58,6 +58,9 @@ def main() -> int:
         if not Path(filepath).exists():
             continue
 
+        if filepath == "setup.py":
+            continue
+
         violations = check_file(filepath)
         if violations:
             print(f"\n❌ {filepath}:")
-- 
GitLab


From e1fadf11976ecc87ed60c9fc1bf8f81271c68c18 Mon Sep 17 00:00:00 2001
From: Shawn Huang <57223022+huangyuxiang03@users.noreply.github.com>
Date: Fri, 30 May 2025 21:45:56 +0800
Subject: [PATCH 076/274] [Feature] minicpm eagle support (#18943)

Signed-off-by: huangyuxiang03 <huangyx0321@gmail.com>
Co-authored-by: huangyuxiang03 <huangyx0321@gmail.com>
---
 tests/models/registry.py                    |   5 +
 vllm/model_executor/models/minicpm.py       |   5 +-
 vllm/model_executor/models/minicpm_eagle.py | 390 ++++++++++++++++++++
 vllm/model_executor/models/registry.py      |   1 +
 4 files changed, 399 insertions(+), 2 deletions(-)
 create mode 100644 vllm/model_executor/models/minicpm_eagle.py

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 18342b671..fe49d2427 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -434,6 +434,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                             trust_remote_code=True,
                                             speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
                                             tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
+    "EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
+                                            trust_remote_code=True,
+                                            is_available_online=False,
+                                            speculative_model="openbmb/MiniCPM-2B-sft-bf16",
+                                            tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                     trust_remote_code=True,
                                     speculative_model="XiaomiMiMo/MiMo-7B-RL")
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 0397b552c..f471a86ff 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -242,6 +242,7 @@ class MiniCPMAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
+
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
@@ -444,6 +445,7 @@ class MiniCPMModel(nn.Module):
             for weight_name in ["w1", "w2", "w3"]
         ]
         params_dict = dict(self.named_parameters())
+
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
@@ -567,7 +569,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+                                   inputs_embeds) / self.scale_width
         return hidden_states
 
     def compute_logits(
@@ -575,7 +577,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        hidden_states = hidden_states / self.scale_width
         logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
new file mode 100644
index 000000000..039c3d22d
--- /dev/null
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -0,0 +1,390 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only EagleMiniCPM model compatible with HuggingFace weights."""
+import math
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsLoRA, SupportsPP
+from .minicpm import MiniCPMAttention as EagleMiniCPMAttention
+from .minicpm import MiniCPMMLP as EagleMiniCPMMLP
+from .minicpm import MiniCPMMoE as EagleMiniCPMMoE
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, maybe_prefix)
+
+
+class EagleMiniCPMDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+        self.hidden_size = config.hidden_size
+        self.rope_theta = getattr(config, "rope_theta", 10000)
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.max_position_embeddings = getattr(config,
+                                               "max_position_embeddings", 8192)
+        self.prefix = prefix
+        self._init_attn_block()
+        self._init_ffn_block()
+
+    def _init_attn_block(self):
+        self.input_layernorm = RMSNorm(self.config.hidden_size,
+                                       eps=self.config.rms_norm_eps)
+        self.self_attn = EagleMiniCPMAttention(
+            hidden_size=self.hidden_size,
+            num_heads=self.config.num_attention_heads,
+            num_kv_heads=self.config.num_key_value_heads,
+            rope_theta=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            max_position_embeddings=self.max_position_embeddings,
+            cache_config=self.cache_config,
+            quant_config=self.quant_config,
+            prefix=f"{self.prefix}.self_attn",
+        )
+
+    def _init_ffn_block(self):
+        self.post_attention_layernorm = RMSNorm(self.config.hidden_size,
+                                                eps=self.config.rms_norm_eps)
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        if self.num_experts == 0:
+            self.mlp = EagleMiniCPMMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=self.config.intermediate_size,
+                hidden_act=self.config.hidden_act,
+                hidden_act_param=getattr(self.config, "hidden_act_param", 0.),
+                quant_config=self.quant_config,
+            )
+        else:
+            self.mlp = EagleMiniCPMMoE(
+                num_experts=self.config.num_experts,
+                top_k=self.config.num_experts_per_tok,
+                hidden_size=self.config.hidden_size,
+                intermediate_size=self.config.intermediate_size)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+        hidden_states = residual + hidden_states * \
+            (self.config.scale_depth / math.sqrt(self.config.mup_denominator))
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * \
+            (self.config.scale_depth / math.sqrt(self.config.mup_denominator))
+
+        return hidden_states, None
+
+
+@support_torch_compile
+class EagleMiniCPMModel(nn.Module):
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 start_layer: int = 0):
+        super().__init__()
+
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+        self.fc = torch.nn.Linear(self.config.hidden_size * 2,
+                                  self.config.hidden_size,
+                                  bias=False)
+        self.input_norm1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_norm2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        self._init_layers(prefix, config, cache_config, quant_config,
+                          start_layer)
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], self.config.hidden_size))
+
+    def _init_layers(
+        self,
+        prefix: str,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig],
+        quant_config: Optional[QuantizationConfig],
+        start_layer: int,
+    ):
+        self.eagle_layers = nn.ModuleList([
+            EagleMiniCPMDecoderLayer(
+                config,
+                cache_config,
+                quant_config,
+                f"{prefix}.eagle_layers.{i + start_layer}",
+            ) for i in range(self.config.num_hidden_layers)
+        ])
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        embedding = self.embed_tokens(input_ids)
+        return embedding * self.config.scale_emb
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        input_embeds = self.get_input_embeddings(input_ids)
+        input_embeds = self.input_norm1(input_embeds)
+        hidden_states = self.input_norm2(hidden_states)
+
+        hidden_states = self.fc(
+            torch.cat((input_embeds, hidden_states), dim=-1))
+        residual = None
+        for layer in self.eagle_layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+
+        return hidden_states, hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = [
+            # (param_name, weight_name, expert_id)
+            ("ws" if weight_name in ["w1", "w3"] else "w2s",
+             f"experts.{expert_id}.{weight_name}.weight", expert_id)
+            for expert_id in range(self.num_experts)
+            for weight_name in ["w1", "w2", "w3"]
+        ]
+        params_dict = dict(self.named_parameters())
+
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if ("rotary_emb.cos_cached" in name
+                    or "rotary_emb.sin_cached" in name):
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  weight_name,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.prefix = prefix
+        self.vllm_config = vllm_config
+        self.config = config
+        self.lora_config = lora_config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+
+        target_layer_num = vllm_config.model_config.get_num_layers(
+            vllm_config.parallel_config)
+
+        self.model = self._init_model(vllm_config=vllm_config,
+                                      prefix=maybe_prefix(prefix, "model"),
+                                      start_layer=target_layer_num)
+
+        unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+        self.scale_width = self.config.hidden_size / self.config.dim_model_base
+
+        self.logits_processor = LogitsProcessor(unpadded_vocab_size,
+                                                config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def _init_model(self,
+                    *,
+                    vllm_config: VllmConfig,
+                    prefix: str = "",
+                    start_layer: int = 0):
+        return EagleMiniCPMModel(vllm_config=vllm_config,
+                                 prefix=prefix,
+                                 start_layer=start_layer)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, hidden_states2 = self.model(input_ids, positions,
+                                                   hidden_states)
+        hidden_states = hidden_states / self.scale_width
+        hidden_states2 = hidden_states2 / self.scale_width
+        return hidden_states, hidden_states2
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 5a6a12fcc..8efd4825b 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -223,6 +223,7 @@ _SPECULATIVE_DECODING_MODELS = {
     "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"),
     "EAGLEModel": ("eagle", "EAGLE"),
     "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"),
+    "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"),
     "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
     "MedusaModel": ("medusa", "Medusa"),
-- 
GitLab


From ec6833c5e9f12edf9f08d4d2c226ac43e8089e7a Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Fri, 30 May 2025 21:45:59 +0800
Subject: [PATCH 077/274] [doc] show the count for fork and watch (#18950)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index 57b1d03de..0c6aff5fa 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -12,8 +12,8 @@
 <p style="text-align:center">
 <script async defer src="https://buttons.github.io/buttons.js"></script>
 <a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
-<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
-<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
 </p>
 
 vLLM is a fast and easy-to-use library for LLM inference and serving.
-- 
GitLab


From b29ca5c4d51ca0129485ed9546d4f94a6a8763a9 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 30 May 2025 10:37:27 -0400
Subject: [PATCH 078/274] [Docs] Update SECURITY.md with link to our security
 guide (#18961)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 SECURITY.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/SECURITY.md b/SECURITY.md
index 47196a1f1..6053cfb41 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
 
 ---
 
+Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
+
 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
-- 
GitLab


From 84ec470fca4f3c1873863086a420a40206f1bf01 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Fri, 30 May 2025 11:00:54 -0400
Subject: [PATCH 079/274] Improve "failed to get the hash of the compiled
 graph" error (#18956)

Signed-off-by: rzou <zou3519@gmail.com>
---
 vllm/compilation/compiler_interface.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 8fa8ce279..9293610cc 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -415,8 +415,14 @@ class InductorAdaptor(CompilerInterface):
         # compilation cache. So turn off the checks if we disable the
         # compilation cache.
         if not envs.VLLM_DISABLE_COMPILE_CACHE:
-            assert hash_str is not None, (
-                "failed to get the hash of the compiled graph")
+            if hash_str is None:
+                raise RuntimeError(
+                    "vLLM failed to compile the model. The most "
+                    "likely reason for this is that a previous compilation "
+                    "failed, leading to a corrupted compilation artifact. "
+                    "We recommend trying to "
+                    "remove ~/.cache/vllm/torch_compile_cache and try again "
+                    "to see the real issue. ")
             assert file_path is not None, (
                 "failed to get the file path of the compiled graph")
         return compiled_graph, (hash_str, file_path)
-- 
GitLab


From 2dbe8c07744cd5b7531c191a734a613f8b797e65 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 30 May 2025 08:17:00 -0700
Subject: [PATCH 080/274] [Perf] API-server scaleout with many-to-many
 server-engine comms  (#17546)

---
 .buildkite/test-pipeline.yaml                 |   2 +
 .../test_api_server_process_manager.py        | 268 ++++++++++
 tests/utils.py                                |   5 +-
 tests/v1/core/test_kv_cache_utils.py          |   1 -
 tests/v1/core/test_prefix_caching.py          |   1 -
 tests/v1/core/test_scheduler.py               |   9 +-
 tests/v1/engine/test_engine_core.py           |  14 +-
 .../openai/test_multi_api_servers.py          | 171 +++++++
 .../unit/test_remote_decode_lifecycle.py      |   4 +-
 .../unit/test_remote_prefill_lifecycle.py     |   6 +-
 tests/v1/kv_connector/unit/utils.py           |   1 -
 vllm/entrypoints/cli/serve.py                 | 179 ++++++-
 vllm/entrypoints/openai/api_server.py         | 102 ++--
 vllm/lora/worker_manager.py                   |   5 +
 vllm/utils.py                                 |   6 +-
 vllm/v1/core/sched/interface.py               |  10 +-
 vllm/v1/core/sched/scheduler.py               |  53 +-
 vllm/v1/engine/__init__.py                    |   8 +-
 vllm/v1/engine/async_llm.py                   |  14 +-
 vllm/v1/engine/coordinator.py                 | 252 ++++++++++
 vllm/v1/engine/core.py                        | 253 ++++++----
 vllm/v1/engine/core_client.py                 | 456 ++++++++++--------
 vllm/v1/metrics/loggers.py                    |  79 +--
 vllm/v1/metrics/prometheus.py                 |  77 +++
 vllm/v1/request.py                            |   5 +-
 vllm/v1/utils.py                              | 301 +++++++++++-
 26 files changed, 1837 insertions(+), 445 deletions(-)
 create mode 100644 tests/entrypoints/test_api_server_process_manager.py
 create mode 100644 tests/v1/entrypoints/openai/test_multi_api_servers.py
 create mode 100644 vllm/v1/engine/coordinator.py
 create mode 100644 vllm/v1/metrics/prometheus.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 46785a8b3..bff2f69c1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -618,9 +618,11 @@ steps:
   - vllm/worker/model_runner.py
   - entrypoints/llm/test_collective_rpc.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - vllm/v1/engine/
   commands:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py
new file mode 100644
index 000000000..0dd1fdd99
--- /dev/null
+++ b/tests/entrypoints/test_api_server_process_manager.py
@@ -0,0 +1,268 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import multiprocessing
+import socket
+import threading
+import time
+from typing import Optional
+from unittest.mock import patch
+
+import pytest
+
+from vllm.v1.utils import (APIServerProcessManager,
+                           wait_for_completion_or_failure)
+
+# Global variables to control worker behavior
+WORKER_RUNTIME_SECONDS = 0.5
+
+
+# Mock implementation of run_api_server_worker
+def mock_run_api_server_worker(listen_address, sock, args, client_config=None):
+    """Mock run_api_server_worker that runs for a specific time."""
+    print(f"Mock worker started with client_config: {client_config}")
+    time.sleep(WORKER_RUNTIME_SECONDS)
+    print("Mock worker completed successfully")
+
+
+@pytest.fixture
+def api_server_args():
+    """Fixture to provide arguments for APIServerProcessManager."""
+    sock = socket.socket()
+    return {
+        "target_server_fn":
+        mock_run_api_server_worker,
+        "listen_address":
+        "localhost:8000",
+        "sock":
+        sock,
+        "args":
+        "test_args",  # Simple string to avoid pickling issues
+        "num_servers":
+        3,
+        "input_addresses": [
+            "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002",
+            "tcp://127.0.0.1:5003"
+        ],
+        "output_addresses": [
+            "tcp://127.0.0.1:6001", "tcp://127.0.0.1:6002",
+            "tcp://127.0.0.1:6003"
+        ],
+        "stats_update_address":
+        "tcp://127.0.0.1:7000",
+    }
+
+
+@pytest.mark.parametrize("with_stats_update", [True, False])
+def test_api_server_process_manager_init(api_server_args, with_stats_update):
+    """Test initializing the APIServerProcessManager."""
+    # Set the worker runtime to ensure tests complete in reasonable time
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 0.5
+
+    # Copy the args to avoid mutating the
+    args = api_server_args.copy()
+
+    if not with_stats_update:
+        args.pop("stats_update_address")
+    manager = APIServerProcessManager(**args)
+
+    try:
+        # Verify the manager was initialized correctly
+        assert len(manager.processes) == 3
+
+        # Verify all processes are running
+        for proc in manager.processes:
+            assert proc.is_alive()
+
+        print("Waiting for processes to run...")
+        time.sleep(WORKER_RUNTIME_SECONDS / 2)
+
+        # They should still be alive at this point
+        for proc in manager.processes:
+            assert proc.is_alive()
+
+    finally:
+        # Always clean up the processes
+        print("Cleaning up processes...")
+        manager.close()
+
+        # Give processes time to terminate
+        time.sleep(0.2)
+
+        # Verify all processes were terminated
+        for proc in manager.processes:
+            assert not proc.is_alive()
+
+
+@patch("vllm.entrypoints.cli.serve.run_api_server_worker",
+       mock_run_api_server_worker)
+def test_wait_for_completion_or_failure(api_server_args):
+    """Test that wait_for_completion_or_failure works with failures."""
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 1.0
+
+    # Create the manager
+    manager = APIServerProcessManager(**api_server_args)
+
+    try:
+        assert len(manager.processes) == 3
+
+        # Create a result capture for the thread
+        result: dict[str, Optional[Exception]] = {"exception": None}
+
+        def run_with_exception_capture():
+            try:
+                wait_for_completion_or_failure(api_server_manager=manager)
+            except Exception as e:
+                result["exception"] = e
+
+        # Start a thread to run wait_for_completion_or_failure
+        wait_thread = threading.Thread(target=run_with_exception_capture,
+                                       daemon=True)
+        wait_thread.start()
+
+        # Let all processes run for a short time
+        time.sleep(0.2)
+
+        # All processes should still be running
+        assert all(proc.is_alive() for proc in manager.processes)
+
+        # Now simulate a process failure
+        print("Simulating process failure...")
+        manager.processes[0].terminate()
+
+        # Wait for the wait_for_completion_or_failure
+        # to detect and handle the failure
+        # This should trigger it to terminate all other processes
+        wait_thread.join(timeout=1.0)
+
+        # The wait thread should have exited
+        assert not wait_thread.is_alive()
+
+        # Verify that an exception was raised with appropriate error message
+        assert result["exception"] is not None
+        assert "died with exit code" in str(result["exception"])
+
+        # All processes should now be terminated
+        for i, proc in enumerate(manager.processes):
+            assert not proc.is_alive(), f"Process {i} should not be alive"
+
+    finally:
+        manager.close()
+        time.sleep(0.2)
+
+
+@pytest.mark.timeout(30)
+def test_normal_completion(api_server_args):
+    """Test that wait_for_completion_or_failure works in normal completion."""
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 0.1
+
+    # Create the manager
+    manager = APIServerProcessManager(**api_server_args)
+
+    try:
+        # Give processes time to terminate
+        # wait for processes to complete
+        remaining_processes = manager.processes.copy()
+        while remaining_processes:
+            for proc in remaining_processes:
+                if not proc.is_alive():
+                    remaining_processes.remove(proc)
+            time.sleep(0.1)
+
+        # Verify all processes have terminated
+        for i, proc in enumerate(manager.processes):
+            assert not proc.is_alive(
+            ), f"Process {i} still alive after terminate()"
+
+        # Now call wait_for_completion_or_failure
+        # since all processes have already
+        # terminated, it should return immediately
+        # with no error
+        wait_for_completion_or_failure(api_server_manager=manager)
+
+    finally:
+        # Clean up just in case
+        manager.close()
+        time.sleep(0.2)
+
+
+@pytest.mark.timeout(30)
+def test_external_process_monitoring(api_server_args):
+    """Test that wait_for_completion_or_failure handles additional processes."""
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 100
+
+    # Create and start the external process
+    # (simulates local_engine_manager or coordinator)
+    spawn_context = multiprocessing.get_context("spawn")
+    external_proc = spawn_context.Process(target=mock_run_api_server_worker,
+                                          name="MockExternalProcess")
+    external_proc.start()
+
+    # Create the class to simulate a coordinator
+    class MockCoordinator:
+
+        def __init__(self, proc):
+            self.proc = proc
+
+        def close(self):
+            if self.proc.is_alive():
+                self.proc.terminate()
+                self.proc.join(timeout=0.5)
+
+    # Create a mock coordinator with the external process
+    mock_coordinator = MockCoordinator(external_proc)
+
+    # Create the API server manager
+    manager = APIServerProcessManager(**api_server_args)
+
+    try:
+        # Verify manager initialization
+        assert len(manager.processes) == 3
+
+        # Create a result capture for the thread
+        result: dict[str, Optional[Exception]] = {"exception": None}
+
+        def run_with_exception_capture():
+            try:
+                wait_for_completion_or_failure(api_server_manager=manager,
+                                               coordinator=mock_coordinator)
+            except Exception as e:
+                result["exception"] = e
+
+        # Start a thread to run wait_for_completion_or_failure
+        wait_thread = threading.Thread(target=run_with_exception_capture,
+                                       daemon=True)
+        wait_thread.start()
+
+        # Terminate the external process to trigger a failure
+        time.sleep(0.2)
+        external_proc.terminate()
+
+        # Wait for the thread to detect the failure
+        wait_thread.join(timeout=1.0)
+
+        # The wait thread should have completed
+        assert not wait_thread.is_alive(
+        ), "wait_for_completion_or_failure thread still running"
+
+        # Verify that an exception was raised with appropriate error message
+        assert result["exception"] is not None, "No exception was raised"
+        error_message = str(result["exception"])
+        assert "died with exit code" in error_message, \
+            f"Unexpected error message: {error_message}"
+        assert "MockExternalProcess" in error_message, \
+            f"Error doesn't mention external process: {error_message}"
+
+        # Verify that all API server processes were terminated as a result
+        for i, proc in enumerate(manager.processes):
+            assert not proc.is_alive(
+            ), f"API server process {i} was not terminated"
+
+    finally:
+        # Clean up
+        manager.close()
+        mock_coordinator.close()
+        time.sleep(0.2)
diff --git a/tests/utils.py b/tests/utils.py
index bf38d7843..d21b18470 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -28,7 +28,7 @@ from tests.models.utils import TextTextLogprobs
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.entrypoints.cli.serve import ServeSubcommand
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -99,7 +99,8 @@ class RemoteOpenAIServer:
 
         parser = FlexibleArgumentParser(
             description="vLLM's remote OpenAI server.")
-        parser = make_arg_parser(parser)
+        subparsers = parser.add_subparsers(required=False, dest="subparser")
+        parser = ServeSubcommand().subparser_init(subparsers)
         args = parser.parse_args(["--model", model, *vllm_serve_args])
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 43a27da2d..d3d62cf09 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -45,7 +45,6 @@ def make_request(request_id,
         multi_modal_placeholders=mm_positions,
         sampling_params=SamplingParams(max_tokens=17),
         eos_token_id=100,
-        arrival_time=0,
         lora_request=None,
         cache_salt=cache_salt,
     )
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 3da27786b..ba3c0b3cf 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -38,7 +38,6 @@ def make_request(request_id,
         sampling_params=SamplingParams(max_tokens=17,
                                        prompt_logprobs=prompt_logprobs),
         eos_token_id=100,
-        arrival_time=0,
         lora_request=None,
         cache_salt=cache_salt,
     )
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index f40d477a0..f38454b1b 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -138,7 +138,6 @@ def create_requests(num_requests: int,
             multi_modal_placeholders=mm_position,
             multi_modal_hashes=None,
             eos_token_id=EOS_TOKEN_ID,
-            arrival_time=0,
         )
         requests.append(request)
     return requests
@@ -744,7 +743,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
         assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
 
     # No draft or accepted tokens counted yet
-    assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None
+    assert not engine_core_outputs or (
+        engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None)
 
     # Schedule the speculated tokens for validation
     output = scheduler.schedule()
@@ -772,7 +772,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     engine_core_outputs = scheduler.update_from_output(output,
                                                        model_runner_output)
 
-    scheduler_stats = engine_core_outputs.scheduler_stats
+    scheduler_stats = engine_core_outputs[0].scheduler_stats \
+        if engine_core_outputs else None
     if expected[0] == 0:
         assert scheduler_stats.spec_decoding_stats is None
     else:
@@ -843,7 +844,7 @@ def _step_until_done(
             # We should be in the decode phase now.
             assert num_scheduled_tokens == 1
         assert len(output.kv_connector_metadata.requests) == 0
-        ecos = scheduler.update_from_output(output, model_runner_output)
+        ecos = scheduler.update_from_output(output, model_runner_output)[0]
         all_done = True
         for eco in ecos.outputs:
             if eco.finish_reason is None:
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index ae1d8a762..e78c7480a 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -88,7 +88,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
         assert len(engine_core.scheduler.running) == 4
 
         # Loop through until they are all done.
-        while len(engine_core.step()[0].outputs) > 0:
+        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -163,11 +163,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
         req0.request_id = req1.request_id = "test"
         engine_core.add_request(req0)
 
-        while len(engine_core.step()[0].outputs) > 0:
+        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
             pass
 
         engine_core.add_request(req1)
-        while len(engine_core.step()[0].outputs) > 0:
+        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -207,7 +207,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
             assert len(engine_core.scheduler.waiting) == 1
             assert len(engine_core.scheduler.running) == 0
             # Loop through until they are all done.
-            while len(engine_core.step()[0].outputs) > 0:
+            while (outs := engine_core.step()[0].get(0)) and outs.outputs:
                 pass
             assert len(engine_core.scheduler.waiting) == 0
             assert len(engine_core.scheduler.running) == 0
@@ -327,7 +327,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         assert scheduler_output.num_scheduled_tokens[1] == 4
 
         # Batch queue is full. Finish Batch 2. Get first token of req0.
-        output = engine_core.step_with_batch_queue()[0]
+        output = engine_core.step_with_batch_queue()[0].get(0)
         assert output is not None
         assert len(output.outputs) == 1
         assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
@@ -339,7 +339,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         assert scheduler_output.num_scheduled_tokens[0] == 1
 
         # Batch queue is full. Finish Batch 3. Get first token of req1.
-        output = engine_core.step_with_batch_queue()[0]
+        output = engine_core.step_with_batch_queue()[0].get(0)
         assert output is not None
         assert len(output.outputs) == 1
         assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
@@ -362,7 +362,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
             if step % 2 == 0:
                 # Even steps consumes an output.
                 assert output is not None
-                assert len(output.outputs) == 1
+                assert len(output[0].outputs) == 1
                 if req_id in engine_core.scheduler.requests:
                     assert engine_core.scheduler.requests[
                         req_id].num_tokens == expected_num_tokens[req_id]
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
new file mode 100644
index 000000000..7b4583bc3
--- /dev/null
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import os
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+
+from tests.utils import RemoteOpenAIServer
+
+MODEL_NAME = "ibm-research/PowerMoE-3b"
+
+DP_SIZE = os.getenv("DP_SIZE", "1")
+
+
+@pytest.fixture(scope="module")
+def default_server_args():
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        "--api-server-count",
+        "4",
+        "--data_parallel_size",
+        DP_SIZE,
+    ]
+
+
+@pytest.fixture(scope="module")
+def server(default_server_args):
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_single_completion(client: openai.AsyncOpenAI,
+                                 model_name: str) -> None:
+
+    async def make_request():
+        completion = await client.completions.create(
+            model=model_name,
+            prompt="Hello, my name is",
+            max_tokens=10,
+            temperature=1.0)
+
+        assert completion.id is not None
+        assert completion.choices is not None and len(completion.choices) == 1
+
+        choice = completion.choices[0]
+        # The exact number of tokens can vary slightly with temperature=1.0,
+        # so we check for a reasonable minimum length.
+        assert len(choice.text) >= 1
+        # Finish reason might not always be 'length' if the model finishes early
+        # or due to other reasons, especially with high temperature.
+        # So, we'll accept 'length' or 'stop'.
+        assert choice.finish_reason in ("length", "stop")
+
+        # Token counts can also vary, so we check they are positive.
+        assert completion.usage.completion_tokens > 0
+        assert completion.usage.prompt_tokens > 0
+        assert completion.usage.total_tokens > 0
+        return completion
+
+    # Test single request
+    result = await make_request()
+    assert result is not None
+
+    await asyncio.sleep(0.5)
+
+    # Send two bursts of requests
+    num_requests = 100
+    tasks = [make_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+    assert len(results) == num_requests
+    assert all(completion is not None for completion in results)
+
+    await asyncio.sleep(0.5)
+
+    tasks = [make_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+    assert len(results) == num_requests
+    assert all(completion is not None for completion in results)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_completion_streaming(client: openai.AsyncOpenAI,
+                                    model_name: str) -> None:
+    prompt = "What is an LLM?"
+
+    async def make_streaming_request():
+        # Perform a non-streaming request to get the expected full output
+        single_completion = await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+        )
+        single_output = single_completion.choices[0].text
+
+        # Perform the streaming request
+        stream = await client.completions.create(model=model_name,
+                                                 prompt=prompt,
+                                                 max_tokens=5,
+                                                 temperature=0.0,
+                                                 stream=True)
+        chunks: list[str] = []
+        finish_reason_count = 0
+        last_chunk = None
+        async for chunk in stream:
+            chunks.append(chunk.choices[0].text)
+            if chunk.choices[0].finish_reason is not None:
+                finish_reason_count += 1
+            last_chunk = chunk  # Keep track of the last chunk
+
+        # finish reason should only return in the last block for OpenAI API
+        assert finish_reason_count == 1, (
+            "Finish reason should appear exactly once.")
+        assert last_chunk is not None, (
+            "Stream should have yielded at least one chunk.")
+        assert last_chunk.choices[
+            0].finish_reason == "length", "Finish reason should be 'length'."
+        # Check that the combined text matches the non-streamed version.
+        assert "".join(
+            chunks
+        ) == single_output, "Streamed output should match non-streamed output."
+        return True  # Indicate success for this request
+
+    # Test single request
+    result = await make_streaming_request()
+    assert result is not None
+
+    await asyncio.sleep(0.5)
+
+    # Send two bursts of requests
+    num_requests = 100
+    tasks = [make_streaming_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+
+    assert len(
+        results
+    ) == num_requests, f"Expected {num_requests} results, got {len(results)}"
+    assert all(results), "Not all streaming requests completed successfully."
+
+    await asyncio.sleep(0.5)
+
+    tasks = [make_streaming_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+
+    assert len(
+        results
+    ) == num_requests, f"Expected {num_requests} results, got {len(results)}"
+    assert all(results), "Not all streaming requests completed successfully."
diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
index 770981403..dc963251c 100644
--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -43,7 +43,7 @@ def test_basic_lifecycle():
     # Ensure the request is finished after 1 tokens.
     assert request.is_finished()
     assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
-    output = engine_core_outputs.outputs[0]
+    output = engine_core_outputs[0].outputs[0]
     assert output.finish_reason == FinishReason.LENGTH
     assert output.kv_transfer_params is not None
 
@@ -165,7 +165,7 @@ def test_prefix_cache_lifecycle():
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output(reqs=[request_remote])
     eco = scheduler.update_from_output(scheduler_output, model_runner_output)
-    kv_transfer_params = eco.outputs[0].kv_transfer_params
+    kv_transfer_params = eco[0].outputs[0].kv_transfer_params
 
     # Ensure we send all block ids, even if there is a cache hit.
     assert (len(
diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
index 6fcff0d62..86eacb693 100644
--- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
@@ -61,7 +61,7 @@ def test_basic_lifecycle():
     # (1c): update_from_output()
     engine_core_outputs = scheduler.update_from_output(scheduler_output,
                                                        model_runner_output)
-    assert len(engine_core_outputs.outputs) == 0
+    assert not engine_core_outputs or not engine_core_outputs[0].outputs
 
     # STEP (2):
     # (2a): schedule(): nothing happens!
@@ -112,7 +112,7 @@ def test_basic_lifecycle():
                                                        model_runner_output)
     scheduler.schedule()
 
-    outputs = engine_core_outputs.outputs
+    outputs = engine_core_outputs[0].outputs
     assert len(outputs) == 1
     output = outputs[0]
     assert output.finish_reason == FinishReason.STOP
@@ -335,7 +335,7 @@ def test_full_block_prompt():
                                                        model_runner_output)
     scheduler.schedule()
 
-    outputs = engine_core_outputs.outputs
+    outputs = engine_core_outputs[0].outputs
     assert len(outputs) == 1
     output = outputs[0]
     assert output.finish_reason == FinishReason.STOP
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 53e2d6fda..3c3190b32 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -153,7 +153,6 @@ def create_request(
         multi_modal_placeholders=None,
         multi_modal_hashes=None,
         eos_token_id=EOS_TOKEN_ID,
-        arrival_time=0,
     )
     req.kv_transfer_params = kv_transfer_params
     return req
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 957fec290..e65c97073 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -1,24 +1,35 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+import os
 import signal
+import sys
 
 import uvloop
+import zmq
 
 import vllm.envs as envs
 from vllm import AsyncEngineArgs
 from vllm.entrypoints.cli.types import CLISubcommand
-from vllm.entrypoints.openai.api_server import run_server
+from vllm.entrypoints.openai.api_server import (run_server, run_server_worker,
+                                                setup_server)
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                               validate_parsed_serve_args)
 from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG,
                                     show_filtered_argument_or_group_from_help)
+from vllm.executor.multiproc_worker_utils import _add_prefix
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, get_tcp_uri
+from vllm.utils import FlexibleArgumentParser, get_tcp_uri, zmq_socket_ctx
+from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.engine.core import EngineCoreProc
 from vllm.v1.engine.core_client import CoreEngineProcManager
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
+from vllm.v1.utils import (APIServerProcessManager, CoreEngine,
+                           EngineZmqAddresses, get_engine_client_zmq_addr,
+                           wait_for_completion_or_failure,
+                           wait_for_engine_startup)
 
 logger = init_logger(__name__)
 
@@ -36,9 +47,12 @@ class ServeSubcommand(CLISubcommand):
         if hasattr(args, 'model_tag') and args.model_tag is not None:
             args.model = args.model_tag
 
-        if args.headless:
+        if args.headless or args.api_server_count < 1:
             run_headless(args)
+        elif args.api_server_count > 1:
+            run_multi_api_server(args)
         else:
+            # Single API server (this process).
             uvloop.run(run_server(args))
 
     def validate(self, args: argparse.Namespace) -> None:
@@ -69,6 +83,11 @@ class ServeSubcommand(CLISubcommand):
             type=int,
             default=0,
             help='Starting data parallel rank for secondary nodes.')
+        serve_parser.add_argument('--api-server-count',
+                                  '-asc',
+                                  type=int,
+                                  default=1,
+                                  help='How many API server processes to run.')
         serve_parser.add_argument(
             "--config",
             type=str,
@@ -91,23 +110,26 @@ def cmd_init() -> list[CLISubcommand]:
 
 def run_headless(args: argparse.Namespace):
 
+    if args.api_server_count > 1:
+        raise ValueError("api_server_count can't be set in headless mode")
+
     # Create the EngineConfig.
     engine_args = AsyncEngineArgs.from_cli_args(args)
     usage_context = UsageContext.OPENAI_API_SERVER
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
     if not envs.VLLM_USE_V1:
-        raise RuntimeError("Headless mode is only supported for V1")
+        raise ValueError("Headless mode is only supported for V1")
 
     parallel_config = vllm_config.parallel_config
     local_engine_count = parallel_config.data_parallel_size_local
     host = parallel_config.data_parallel_master_ip
     port = engine_args.data_parallel_rpc_port  # add to config too
-    input_address = get_tcp_uri(host, port)
+    handshake_address = get_tcp_uri(host, port)
 
     if local_engine_count <= 0:
-        raise RuntimeError("data_parallel_size_local must be > 0 in "
-                           "headless mode")
+        raise ValueError("data_parallel_size_local must be > 0 in "
+                         "headless mode")
 
     # Catch SIGTERM and SIGINT to allow graceful shutdown.
     def signal_handler(signum, frame):
@@ -119,7 +141,7 @@ def run_headless(args: argparse.Namespace):
 
     logger.info(
         "Launching %d data parallel engine(s) in headless mode, "
-        "with head node address %s.", local_engine_count, input_address)
+        "with head node address %s.", local_engine_count, handshake_address)
 
     # Create the engines.
     engine_manager = CoreEngineProcManager(
@@ -129,7 +151,7 @@ def run_headless(args: argparse.Namespace):
         local_start_index=0,
         vllm_config=vllm_config,
         on_head_node=False,
-        input_address=input_address,
+        handshake_address=handshake_address,
         executor_class=Executor.get_class(vllm_config),
         log_stats=not engine_args.disable_log_stats,
     )
@@ -139,3 +161,142 @@ def run_headless(args: argparse.Namespace):
     finally:
         logger.info("Shutting down.")
         engine_manager.close()
+
+
+def run_multi_api_server(args: argparse.Namespace):
+
+    assert not args.headless
+    num_api_servers = args.api_server_count
+    assert num_api_servers > 0
+
+    if num_api_servers > 1:
+        setup_multiprocess_prometheus()
+
+    listen_address, sock = setup_server(args)
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    usage_context = UsageContext.OPENAI_API_SERVER
+    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
+    model_config = vllm_config.model_config
+
+    if num_api_servers > 1:
+        if not envs.VLLM_USE_V1:
+            raise ValueError("api_server_count > 1 is only supported for V1")
+
+        if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+            raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
+                             "with api_server_count > 1")
+
+        if model_config.is_multimodal_model and not (
+                model_config.disable_mm_preprocessor_cache):
+            logger.warning(
+                "Multi-model preprocessor cache will be disabled for"
+                " api_server_count > 1")
+            model_config.disable_mm_preprocessor_cache = True
+
+    parallel_config = vllm_config.parallel_config
+
+    assert parallel_config.data_parallel_rank == 0
+
+    dp_size = parallel_config.data_parallel_size
+    local_engine_count = parallel_config.data_parallel_size_local
+    host = parallel_config.data_parallel_master_ip
+    local_only = local_engine_count == dp_size
+
+    # Set up input and output addresses.
+    input_addresses = [
+        get_engine_client_zmq_addr(local_only, host)
+        for _ in range(num_api_servers)
+    ]
+    output_addresses = [
+        get_engine_client_zmq_addr(local_only, host)
+        for _ in range(num_api_servers)
+    ]
+
+    addresses = EngineZmqAddresses(
+        inputs=input_addresses,
+        outputs=output_addresses,
+    )
+
+    # Set up coordinator for dp > 1.
+    coordinator = None
+    stats_update_address = None
+    if dp_size > 1:
+        coordinator = DPCoordinator(parallel_config)
+        addresses.coordinator_input, addresses.coordinator_output = (
+            coordinator.get_engine_socket_addresses())
+        stats_update_address = coordinator.get_stats_publish_address()
+        logger.info("Started DP Coordinator process (PID: %d)",
+                    coordinator.proc.pid)
+
+    handshake_address = get_engine_client_zmq_addr(
+        local_only, host, parallel_config.data_parallel_rpc_port)
+
+    with zmq_socket_ctx(handshake_address, zmq.ROUTER,
+                        bind=True) as handshake_socket:
+
+        # Start local engines.
+        if not local_engine_count:
+            local_engine_manager = None
+        else:
+            local_engine_manager = CoreEngineProcManager(
+                EngineCoreProc.run_engine_core,
+                vllm_config=vllm_config,
+                executor_class=Executor.get_class(vllm_config),
+                log_stats=not engine_args.disable_log_stats,
+                handshake_address=handshake_address,
+                on_head_node=True,
+                local_engine_count=local_engine_count,
+                start_index=0,
+                local_start_index=0)
+
+        # Start API servers using the manager
+        api_server_manager = APIServerProcessManager(
+            target_server_fn=run_api_server_worker_proc,
+            listen_address=listen_address,
+            sock=sock,
+            args=args,
+            num_servers=num_api_servers,
+            input_addresses=input_addresses,
+            output_addresses=output_addresses,
+            stats_update_address=stats_update_address)
+
+        # Wait for engine handshakes to complete.
+        core_engines = [
+            CoreEngine(index=i, local=(i < local_engine_count))
+            for i in range(dp_size)
+        ]
+        wait_for_engine_startup(
+            handshake_socket,
+            addresses,
+            core_engines,
+            parallel_config,
+            vllm_config.cache_config,
+            local_engine_manager,
+            coordinator.proc if coordinator else None,
+        )
+
+        # Wait for API servers
+        wait_for_completion_or_failure(
+            api_server_manager=api_server_manager,
+            local_engine_manager=local_engine_manager,
+            coordinator=coordinator)
+
+
+def run_api_server_worker_proc(listen_address,
+                               sock,
+                               args,
+                               client_config=None,
+                               **uvicorn_kwargs) -> None:
+    """Entrypoint for individual API server worker processes."""
+
+    # Add process-specific prefix to stdout and stderr.
+    from multiprocessing import current_process
+    process_name = current_process().name
+    pid = os.getpid()
+    _add_prefix(sys.stdout, process_name, pid)
+    _add_prefix(sys.stderr, process_name, pid)
+
+    uvloop.run(
+        run_server_worker(listen_address, sock, args, client_config,
+                          **uvicorn_kwargs))
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index b991cb3a4..1e7f88a6a 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -17,7 +17,7 @@ from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
 from json import JSONDecodeError
-from typing import Annotated, Optional
+from typing import Annotated, Any, Optional
 
 import prometheus_client
 import regex as re
@@ -26,6 +26,8 @@ from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
+from prometheus_client import make_asgi_app
+from prometheus_fastapi_instrumentator import Instrumentator
 from starlette.concurrency import iterate_in_threadpool
 from starlette.datastructures import State
 from starlette.routing import Mount
@@ -97,6 +99,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
                         is_valid_ipv6_address, set_ulimit)
+from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -142,14 +145,17 @@ async def lifespan(app: FastAPI):
 
 @asynccontextmanager
 async def build_async_engine_client(
-        args: Namespace) -> AsyncIterator[EngineClient]:
+    args: Namespace,
+    client_config: Optional[dict[str, Any]] = None,
+) -> AsyncIterator[EngineClient]:
 
     # Context manager to handle engine_client lifecycle
     # Ensures everything is shutdown and cleaned up on error/exit
     engine_args = AsyncEngineArgs.from_cli_args(args)
 
     async with build_async_engine_client_from_engine_args(
-            engine_args, args.disable_frontend_multiprocessing) as engine:
+            engine_args, args.disable_frontend_multiprocessing,
+            client_config) as engine:
         yield engine
 
 
@@ -157,6 +163,7 @@ async def build_async_engine_client(
 async def build_async_engine_client_from_engine_args(
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
+    client_config: Optional[dict[str, Any]] = None,
 ) -> AsyncIterator[EngineClient]:
     """
     Create EngineClient, either:
@@ -179,12 +186,16 @@ async def build_async_engine_client_from_engine_args(
 
         from vllm.v1.engine.async_llm import AsyncLLM
         async_llm: Optional[AsyncLLM] = None
+        client_index = client_config.pop(
+            "client_index") if client_config else 0
         try:
             async_llm = AsyncLLM.from_vllm_config(
                 vllm_config=vllm_config,
                 usage_context=usage_context,
                 disable_log_requests=engine_args.disable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats)
+                disable_log_stats=engine_args.disable_log_stats,
+                client_addresses=client_config,
+                client_index=client_index)
 
             # Don't keep the dummy data in memory
             await async_llm.reset_mm_cache()
@@ -318,22 +329,9 @@ class PrometheusResponse(Response):
 
 
 def mount_metrics(app: FastAPI):
-    # Lazy import for prometheus multiprocessing.
-    # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
-    # before prometheus_client is imported.
-    # See https://prometheus.github.io/client_python/multiprocess/
-    from prometheus_client import (REGISTRY, CollectorRegistry, make_asgi_app,
-                                   multiprocess)
-    from prometheus_fastapi_instrumentator import Instrumentator
-
-    registry = REGISTRY
-
-    prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
-    if prometheus_multiproc_dir_path is not None:
-        logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
-                     prometheus_multiproc_dir_path)
-        registry = CollectorRegistry()
-        multiprocess.MultiProcessCollector(registry)
+    """Mount prometheus metrics to a FastAPI app."""
+
+    registry = get_prometheus_registry()
 
     # `response_class=PrometheusResponse` is needed to return an HTTP response
     # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
@@ -1256,16 +1254,10 @@ def create_server_socket(addr: tuple[str, int]) -> socket.socket:
     return sock
 
 
-async def run_server(args, **uvicorn_kwargs) -> None:
-    logger.info("vLLM API server version %s", VLLM_VERSION)
-    log_non_default_args(args)
-
-    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
-        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
-
+def validate_api_server_args(args):
     valid_tool_parses = ToolParserManager.tool_parsers.keys()
     if args.enable_auto_tool_choice \
-        and args.tool_call_parser not in valid_tool_parses:
+            and args.tool_call_parser not in valid_tool_parses:
         raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
                        f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
@@ -1276,6 +1268,19 @@ async def run_server(args, **uvicorn_kwargs) -> None:
             f"invalid reasoning parser: {args.reasoning_parser} "
             f"(chose from {{ {','.join(valid_reasoning_parses)} }})")
 
+
+def setup_server(args):
+    """Validate API server args, set up signal handler, create socket
+    ready to serve."""
+
+    logger.info("vLLM API server version %s", VLLM_VERSION)
+    log_non_default_args(args)
+
+    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
+        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
+
+    validate_api_server_args(args)
+
     # workaround to make sure that we bind the port before the engine is set up.
     # This avoids race conditions with ray.
     # see https://github.com/vllm-project/vllm/issues/8204
@@ -1292,22 +1297,41 @@ async def run_server(args, **uvicorn_kwargs) -> None:
 
     signal.signal(signal.SIGTERM, signal_handler)
 
-    async with build_async_engine_client(args) as engine_client:
+    addr, port = sock_addr
+    is_ssl = args.ssl_keyfile and args.ssl_certfile
+    host_part = f"[{addr}]" if is_valid_ipv6_address(
+        addr) else addr or "0.0.0.0"
+    listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}"
+
+    return listen_address, sock
+
+
+async def run_server(args, **uvicorn_kwargs) -> None:
+    """Run a single-worker API server."""
+    listen_address, sock = setup_server(args)
+    await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+
+
+async def run_server_worker(listen_address,
+                            sock,
+                            args,
+                            client_config=None,
+                            **uvicorn_kwargs) -> None:
+    """Run a single API server worker."""
+
+    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
+        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
+
+    server_index = client_config.get("client_index", 0) if client_config else 0
+
+    async with build_async_engine_client(args, client_config) as engine_client:
         app = build_app(args)
 
         vllm_config = await engine_client.get_vllm_config()
         await init_app_state(engine_client, vllm_config, app.state, args)
 
-        def _listen_addr(a: str) -> str:
-            if is_valid_ipv6_address(a):
-                return '[' + a + ']'
-            return a or "0.0.0.0"
-
-        is_ssl = args.ssl_keyfile and args.ssl_certfile
-        logger.info("Starting vLLM API server on http%s://%s:%d",
-                    "s" if is_ssl else "", _listen_addr(sock_addr[0]),
-                    sock_addr[1])
-
+        logger.info("Starting vLLM API server %d on %s", server_index,
+                    listen_address)
         shutdown_task = await serve_http(
             app,
             sock=sock,
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index afc8a8dc3..f1ae03097 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -229,6 +229,11 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
             self.add_adapter(lora)
 
     def add_adapter(self, lora_request: LoRARequest) -> bool:
+        # Note that this method is not thread-safe. It may be invoked multiple
+        # times for the same adapter when using multiple API servers.
+        # This is ok because it's currently only called from
+        # the single-threaded core engine loop.
+
         if lora_request.lora_int_id not in self.list_adapters():
             # Load the new adapter first to ensure it is actually valid, before
             # evicting any existing adapters.
diff --git a/vllm/utils.py b/vllm/utils.py
index 25e34446a..65d3579d5 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2420,6 +2420,7 @@ def make_zmq_socket(
     socket_type: Any,
     bind: Optional[bool] = None,
     identity: Optional[bytes] = None,
+    linger: Optional[int] = None,
 ) -> Union[zmq.Socket, zmq.asyncio.Socket]:  # type: ignore[name-defined]
     """Make a ZMQ socket with the proper bind/connect semantics."""
 
@@ -2439,7 +2440,7 @@ def make_zmq_socket(
         buf_size = -1  # Use system default buffer size
 
     if bind is None:
-        bind = socket_type != zmq.PUSH
+        bind = socket_type not in (zmq.PUSH, zmq.SUB, zmq.XSUB)
 
     if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER):
         socket.setsockopt(zmq.RCVHWM, 0)
@@ -2452,6 +2453,9 @@ def make_zmq_socket(
     if identity is not None:
         socket.setsockopt(zmq.IDENTITY, identity)
 
+    if linger is not None:
+        socket.setsockopt(zmq.LINGER, linger)
+
     # Determine if the path is a TCP socket with an IPv6 address.
     # Enable IPv6 on the zmq socket if so.
     scheme, host, _ = split_zmq_path(path)
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index c17f80b6a..055ce4460 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -45,7 +45,7 @@ class SchedulerInterface(ABC):
         self,
         scheduler_output: "SchedulerOutput",
         model_runner_output: "ModelRunnerOutput",
-    ) -> "EngineCoreOutputs":
+    ) -> dict[int, "EngineCoreOutputs"]:
         """Update the scheduler state based on the model runner output.
 
         This method is called after the model runner has processed the scheduled
@@ -55,7 +55,8 @@ class SchedulerInterface(ABC):
         for each request.
 
         Returns:
-            A EngineCoreOutputs object containing the outputs for each request.
+            A dict of client index to EngineCoreOutputs object containing the
+            outputs for each request originating from that client.
         """
         raise NotImplementedError
 
@@ -126,6 +127,11 @@ class SchedulerInterface(ABC):
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def get_request_counts(self) -> tuple[int, int]:
+        """Returns (num_running_reqs, num_waiting_reqs)."""
+        raise NotImplementedError
+
     @abstractmethod
     def make_stats(self) -> Optional["SchedulerStats"]:
         """Make a SchedulerStats object for logging.
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 4c6b3eea0..ce16a1ed5 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -58,7 +58,8 @@ class Scheduler(SchedulerInterface):
         # request ids should be included in the EngineCoreOutputs returned
         # by update_from_outputs(). This is currently used in the multi-engine
         # case to track request lifetimes efficiently.
-        self.include_finished_set = include_finished_set
+        self.finished_req_ids_dict: Optional[dict[int, set[str]]] = (
+            defaultdict(set) if include_finished_set else None)
 
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
@@ -693,7 +694,7 @@ class Scheduler(SchedulerInterface):
         self,
         scheduler_output: SchedulerOutput,
         model_runner_output: ModelRunnerOutput,
-    ) -> EngineCoreOutputs:
+    ) -> dict[int, EngineCoreOutputs]:
         sampled_token_ids = model_runner_output.sampled_token_ids
         spec_token_ids = model_runner_output.spec_token_ids
         logprobs = model_runner_output.logprobs
@@ -701,7 +702,7 @@ class Scheduler(SchedulerInterface):
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
 
         new_running: list[Request] = []
-        outputs: list[EngineCoreOutput] = []
+        outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
         spec_decoding_stats: Optional[SpecDecodingStats] = None
 
         # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
@@ -797,7 +798,7 @@ class Scheduler(SchedulerInterface):
             if new_token_ids or kv_transfer_params:
 
                 # Add EngineCoreOutput for this Request.
-                outputs.append(
+                outputs[request.client_index].append(
                     EngineCoreOutput(
                         request_id=req_id,
                         new_token_ids=new_token_ids,
@@ -828,17 +829,38 @@ class Scheduler(SchedulerInterface):
                 self._cached_reqs_data[req_data.req_id].append(req_data)
 
         self.running = new_running
-        engine_core_outputs = EngineCoreOutputs(
-            outputs=outputs,
-            scheduler_stats=self.make_stats(spec_decoding_stats),
-        )
-        if self.include_finished_set:
-            #TODO currently sending duplicates here, improve this
-            engine_core_outputs.finished_requests = (
-                scheduler_output.finished_req_ids | self.finished_req_ids)
+
+        # Create EngineCoreOutputs for all clients that have requests with
+        # outputs in this step.
+        engine_core_outputs = {
+            client_index: EngineCoreOutputs(outputs=outs)
+            for client_index, outs in outputs.items()
+        }
+
+        finished_req_ids = self.finished_req_ids_dict
+        if finished_req_ids is not None:
+            # Include ids of requests that finished since last outputs
+            # were sent.
+            for client_index, finished_set in finished_req_ids.items():
+                # Set finished request set in EngineCoreOutputs for this client.
+                if (eco := engine_core_outputs.get(client_index)) is not None:
+                    eco.finished_requests = finished_set
+                else:
+                    engine_core_outputs[client_index] = EngineCoreOutputs(
+                        finished_requests=finished_set)
+            finished_req_ids.clear()
+
+        if engine_core_outputs:
+            # Return stats to only one of the front-ends.
+            next(iter(engine_core_outputs.values())).scheduler_stats = (
+                self.make_stats(spec_decoding_stats))
 
         return engine_core_outputs
 
+    def get_request_counts(self) -> tuple[int, int]:
+        """Returns (num_running_reqs, num_waiting_reqs)."""
+        return len(self.running), len(self.waiting)
+
     def add_request(self, request: Request) -> None:
         self.waiting.append(request)
         self.requests[request.request_id] = request
@@ -880,8 +902,11 @@ class Scheduler(SchedulerInterface):
 
         delay_free_blocks, kv_xfer_params = self._connector_finished(request)
         self.encoder_cache_manager.free(request)
-        self._cached_reqs_data.pop(request.request_id, None)
-        self.finished_req_ids.add(request.request_id)
+        request_id = request.request_id
+        self._cached_reqs_data.pop(request_id, None)
+        self.finished_req_ids.add(request_id)
+        if self.finished_req_ids_dict is not None:
+            self.finished_req_ids_dict[request.client_index].add(request_id)
 
         if not delay_free_blocks:
             self._free_blocks(request)
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 41db99bea..0c9f61a76 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -44,10 +44,6 @@ class EngineCoreRequest(
         omit_defaults=True,  # type: ignore[call-arg]
         gc=False):  # type: ignore[call-arg]
 
-    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
-    # but this object is currently not playing well with msgspec
-    # due to circular imports and typing we have in data.py
-
     request_id: str
     prompt_token_ids: list[int]
     mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]]
@@ -59,6 +55,10 @@ class EngineCoreRequest(
     lora_request: Optional[LoRARequest]
     cache_salt: Optional[str]
 
+    # Index of the client, used to ensure outputs are sent back to the same
+    # client for this request when scaling out the front-end.
+    client_index: int = 0
+
     # Used in DP case to indicate which wave of requests this is expected to
     # belong to, to cover a race condition where the request is sent before
     # a wave finished notification is received.
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 74c2251c7..86781e752 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -36,6 +36,7 @@ from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory,
                                      setup_default_loggers)
+from vllm.v1.metrics.prometheus import shutdown_prometheus
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -54,6 +55,8 @@ class AsyncLLM(EngineClient):
         log_requests: bool = True,
         start_engine_loop: bool = True,
         stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
     ) -> None:
         """
         Create an AsyncLLM.
@@ -124,6 +127,8 @@ class AsyncLLM(EngineClient):
             vllm_config=vllm_config,
             executor_class=executor_class,
             log_stats=self.log_stats,
+            client_addresses=client_addresses,
+            client_index=client_index,
         )
         if self.stat_loggers:
             for stat_logger in self.stat_loggers[0]:
@@ -145,6 +150,8 @@ class AsyncLLM(EngineClient):
         stat_loggers: Optional[list[StatLoggerFactory]] = None,
         disable_log_requests: bool = False,
         disable_log_stats: bool = False,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
     ) -> "AsyncLLM":
         if not envs.VLLM_USE_V1:
             raise ValueError(
@@ -162,6 +169,8 @@ class AsyncLLM(EngineClient):
             log_requests=not disable_log_requests,
             log_stats=not disable_log_stats,
             usage_context=usage_context,
+            client_addresses=client_addresses,
+            client_index=client_index,
         )
 
     @classmethod
@@ -195,6 +204,8 @@ class AsyncLLM(EngineClient):
     def shutdown(self):
         """Shutdown, cleaning up the background proc and IPC."""
 
+        shutdown_prometheus()
+
         if engine_core := getattr(self, "engine_core", None):
             engine_core.shutdown()
 
@@ -398,7 +409,6 @@ class AsyncLLM(EngineClient):
                     # TODO(rob): make into a coroutine and launch it in
                     # background thread once Prometheus overhead is non-trivial.
                     if stat_loggers:
-                        assert outputs.scheduler_stats is not None
                         AsyncLLM._record_stats(
                             stat_loggers[outputs.engine_index],
                             scheduler_stats=outputs.scheduler_stats,
@@ -422,7 +432,7 @@ class AsyncLLM(EngineClient):
     @staticmethod
     def _record_stats(
         stat_loggers: list[StatLoggerBase],
-        scheduler_stats: SchedulerStats,
+        scheduler_stats: Optional[SchedulerStats],
         iteration_stats: Optional[IterationStats],
     ):
         """static so that it can be used from the output_handler task
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
new file mode 100644
index 000000000..b84d4b144
--- /dev/null
+++ b/vllm/v1/engine/coordinator.py
@@ -0,0 +1,252 @@
+# SPDX-License-Identifier: Apache-2.0
+import multiprocessing
+import time
+import weakref
+from typing import Optional
+
+import msgspec.msgpack
+import zmq
+
+from vllm.config import ParallelConfig
+from vllm.logger import init_logger
+from vllm.utils import get_mp_context, get_open_zmq_ipc_path, make_zmq_socket
+from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType
+from vllm.v1.serial_utils import MsgpackDecoder
+from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
+
+logger = init_logger(__name__)
+
+
+class DPCoordinator:
+    """Coordinator process used for data-parallel deployments (DP>1).
+
+    Intermediates between multiple DP engine rank processes and one or more
+    front-end API server processes.
+
+    * Collects stats from each DP engine (currently just waiting and running
+      queue lengths), and publishes these to all front-ends for use in
+      load-balancing decisions.
+
+    * Keeps track of the current DP "request wave" number and running state
+      of the engines. This is received from the DP rank 0 engine and published
+      to the front-end processes along with the current load stats.
+
+      The engines alternate between a global running/paused state. The global
+      "request wave" number is a count of the number of times that the workers
+      collectively move from a running state to a paused state. This transition
+      is synchronized via the all-reduce operation performed in the
+      DPEngineCoreProc._has_global_unfinished_reqs method.
+
+    * Broadcasts the START_DP_WAVE message to engines to move them from paused
+      to running state when one engine receives a new request. This can happen
+      in two cases:
+      1) A front-end sending a new request while the engines are paused will
+         concurrently notify the coordinator.
+      2) An engine receiving a request for a stale request wave while in paused
+         state will notify the coordinator.
+
+    Engines will move into running state when receiving a new request or
+    START_DP_WAVE message.
+    """
+
+    def __init__(self, parallel_config: ParallelConfig):
+
+        # Assume coordinator is colocated with front-end procs.
+        front_publish_address = get_open_zmq_ipc_path()
+
+        dp_size = parallel_config.data_parallel_size
+        assert dp_size > 1, "Coordinator only used for data parallel"
+
+        local_only = dp_size == parallel_config.data_parallel_size_local
+        host = parallel_config.data_parallel_master_ip
+        back_publish_address = get_engine_client_zmq_addr(local_only, host)
+        back_output_address = get_engine_client_zmq_addr(local_only, host)
+
+        context = get_mp_context()
+        self.proc: multiprocessing.Process = context.Process(
+            target=CoordinatorProc.run_coordinator,
+            name="VLLM_DP_Coordinator",
+            kwargs={
+                "engine_count": parallel_config.data_parallel_size,
+                "front_publish_address": front_publish_address,
+                "back_output_address": back_output_address,
+                "back_publish_address": back_publish_address,
+            },
+            daemon=True)
+        self.proc.start()
+
+        self.stats_publish_address = front_publish_address
+        self.coord_in_address = back_publish_address
+        self.coord_out_address = back_output_address
+        self._finalizer = weakref.finalize(self, shutdown, [self.proc])
+
+    def get_stats_publish_address(self) -> str:
+        return self.stats_publish_address
+
+    def get_engine_socket_addresses(self) -> tuple[str, str]:
+        """Returns tuple of ZMQ input address, output address."""
+        return self.coord_in_address, self.coord_out_address
+
+    def close(self):
+        self._finalizer()
+
+
+class EngineState:
+
+    def __init__(self):
+        self.request_counts = [0, 0]  # [waiting, running]
+
+
+class CoordinatorProc:
+
+    def __init__(self, engine_count: int):
+
+        self.ctx = zmq.Context()
+
+        self.engines = [EngineState() for _ in range(engine_count)]
+
+        self.current_wave = 0
+        self.engines_running = False
+        self.stats_changed = False
+
+    @staticmethod
+    def run_coordinator(
+        engine_count: int,
+        front_publish_address: str,
+        back_output_address: str,
+        back_publish_address: str,
+    ):
+        coordinator = CoordinatorProc(engine_count=engine_count)
+        try:
+            coordinator.process_input_socket(
+                front_publish_address,
+                back_output_address,
+                back_publish_address,
+            )
+        except KeyboardInterrupt:
+            logger.info("DP Coordinator process exiting")
+
+    def process_input_socket(self, front_publish_address: str,
+                             back_output_address: str,
+                             back_publish_address: str):
+
+        decoder = MsgpackDecoder(EngineCoreOutputs)
+
+        with make_zmq_socket(
+                path=front_publish_address,  # IPC
+                ctx=self.ctx,
+                socket_type=zmq.XPUB,
+                bind=True,
+        ) as publish_front, make_zmq_socket(
+                path=back_output_address,  # IPC or TCP
+                ctx=self.ctx,
+                socket_type=zmq.PULL,
+                bind=True,
+        ) as output_back, make_zmq_socket(
+                path=back_publish_address,  # IPC or TCP
+                ctx=self.ctx,
+                socket_type=zmq.XPUB,
+                bind=True,
+        ) as publish_back:
+
+            poller = zmq.Poller()
+            poller.register(publish_front, zmq.POLLIN)
+            poller.register(output_back, zmq.POLLIN)
+            last_publish_time = 0
+            while True:
+                elapsed = int(time.time() * 1000) - last_publish_time
+                # Send at 100 ms interval if the stats have changed,
+                # or otherwise every 3 seconds.
+                wait_for = 100 if self.stats_changed else 3000
+                events = poller.poll(timeout=max(0, wait_for - elapsed))
+                if not events:
+                    # Poller timeout - publish current stats to front-ends.
+                    engine_req_counts_list = self._get_engine_counts()
+                    to_publish = (engine_req_counts_list, self.current_wave,
+                                  self.engines_running)
+                    publish_front.send(msgspec.msgpack.encode(to_publish))
+                    last_publish_time = int(time.time() * 1000)
+                    self.stats_changed = False
+                    continue
+
+                events = dict(events)
+
+                if publish_front in events:
+                    buffer = publish_front.recv()
+                    if buffer == b'\x01':
+                        # Ignore subscription messages.
+                        continue
+
+                    # We received a message on the front-end XPUB socket,
+                    # from an API server sending a new request while the
+                    # engines are paused, so that we can wake the other
+                    # engines.
+                    engine_to_exclude, wave = msgspec.msgpack.decode(buffer)
+                    if wave < self.current_wave:
+                        # If the wave number is stale, ensure the message is
+                        # handled by all the engines.
+                        engine_to_exclude = None
+                    if not self.engines_running:
+                        self.engines_running = True
+                        self.stats_changed = True
+                        self._send_start_wave(publish_back, self.current_wave,
+                                              engine_to_exclude)
+
+                if output_back in events:
+                    # We received a message from one of the engines.
+
+                    buffer = output_back.recv()
+                    outputs: EngineCoreOutputs = decoder.decode(buffer)
+
+                    assert not outputs.outputs
+                    assert outputs.utility_output is None
+
+                    eng_index = outputs.engine_index
+                    if outputs.scheduler_stats:
+                        # 1. Updated request load stats - update our local
+                        # state with these.
+                        stats = self.engines[eng_index].request_counts
+                        stats[0] = outputs.scheduler_stats.num_waiting_reqs
+                        stats[1] = outputs.scheduler_stats.num_running_reqs
+                        self.stats_changed = True
+
+                    if (wave := outputs.wave_complete) is not None:
+                        # 2. Notification from rank 0 engine that we've
+                        # moved into the global paused state
+                        # (engines_running==False)
+                        if self.current_wave <= wave:
+                            logger.debug("Moving DP wave from %d to %d.",
+                                         self.current_wave, wave)
+                            self.current_wave = wave + 1
+                            self.engines_running = False
+                            self.stats_changed = True
+                    elif (wave := outputs.start_wave) is not None and (
+                            wave > self.current_wave or
+                        (wave == self.current_wave
+                         and not self.engines_running)):
+                        # 3. The engine received request for a non-current wave
+                        # so we must ensure that other engines progress to the
+                        # next wave (race condition handling).
+                        logger.debug(
+                            "Starting wave %d after notification of "
+                            "stale wave request from engine.", wave)
+                        self.current_wave = wave
+                        self.engines_running = True
+                        self.stats_changed = True
+                        self._send_start_wave(publish_back, wave, eng_index)
+
+    @staticmethod
+    def _send_start_wave(socket: zmq.Socket, wave: int,
+                         exclude_engine_index: Optional[int]):
+        """Broadcast the START_DP_WAVE message to all the engines.
+        It includes the current wave number and index of engine which
+        has already received a request with this wave number and so doesn't
+        require additional notification.
+        """
+        wave_encoded = msgspec.msgpack.encode((wave, exclude_engine_index))
+        socket.send_multipart(
+            (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded))
+
+    def _get_engine_counts(self) -> list[list[int]]:
+        """Return list of [waiting, running] count lists for each engine."""
+        return [e.request_counts for e in self.engines]
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index ed71d9b67..a02abb62b 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -7,6 +7,7 @@ import threading
 import time
 from collections import deque
 from concurrent.futures import Future
+from contextlib import ExitStack
 from inspect import isclass, signature
 from logging import DEBUG
 from typing import Any, Callable, Optional, TypeVar, Union
@@ -22,7 +23,7 @@ from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
-from vllm.utils import make_zmq_socket, resolve_obj_by_qualname, zmq_socket_ctx
+from vllm.utils import make_zmq_socket, resolve_obj_by_qualname
 from vllm.v1.core.kv_cache_utils import (get_kv_cache_config,
                                          unify_kv_cache_configs)
 from vllm.v1.core.sched.interface import SchedulerInterface
@@ -33,10 +34,12 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
 from vllm.v1.engine.mm_input_cache import MirroredProcessingCache
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.utils import EngineHandshakeMetadata, EngineZmqAddresses
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -211,7 +214,7 @@ class EngineCore:
             # Re-raise exception
             raise err
 
-    def step(self) -> tuple[EngineCoreOutputs, bool]:
+    def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         """Schedule, execute, and make output.
 
         Returns tuple of outputs and a flag indicating whether the model
@@ -221,10 +224,7 @@ class EngineCore:
         # Check for any requests remaining in the scheduler - unfinished,
         # or finished and not yet removed from the batch.
         if not self.scheduler.has_requests():
-            return EngineCoreOutputs(
-                outputs=[],
-                scheduler_stats=self.scheduler.make_stats(),
-            ), False
+            return {}, False
         scheduler_output = self.scheduler.schedule()
         model_output = self.execute_model(scheduler_output)
         engine_core_outputs = self.scheduler.update_from_output(
@@ -234,7 +234,7 @@ class EngineCore:
                 scheduler_output.total_num_scheduled_tokens > 0)
 
     def step_with_batch_queue(
-            self) -> tuple[Optional[EngineCoreOutputs], bool]:
+            self) -> tuple[Optional[dict[int, EngineCoreOutputs]], bool]:
         """Schedule and execute batches with the batch queue.
         Note that if nothing to output in this step, None is returned.
 
@@ -276,8 +276,8 @@ class EngineCore:
             # Blocking until the first result is available.
             model_output = future.result()
             self.batch_queue.task_done()
-            engine_core_outputs = self.scheduler.update_from_output(
-                scheduler_output, model_output)
+            engine_core_outputs = (self.scheduler.update_from_output(
+                scheduler_output, model_output))
 
         return engine_core_outputs, scheduled_batch
 
@@ -362,7 +362,7 @@ class EngineCoreProc(EngineCore):
         self,
         vllm_config: VllmConfig,
         on_head_node: bool,
-        input_address: str,
+        handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
         engine_index: int = 0,
@@ -375,65 +375,70 @@ class EngineCoreProc(EngineCore):
         # Create input socket.
         input_ctx = zmq.Context()
         identity = engine_index.to_bytes(length=2, byteorder="little")
-        input_socket = make_zmq_socket(input_ctx,
-                                       input_address,
-                                       zmq.DEALER,
-                                       identity=identity,
-                                       bind=False)
-        try:
+        with make_zmq_socket(input_ctx,
+                             handshake_address,
+                             zmq.DEALER,
+                             identity=identity,
+                             linger=5000,
+                             bind=False) as handshake_socket:
+
             # Register engine with front-end.
-            output_address = self.startup_handshake(
-                input_socket, on_head_node, vllm_config.parallel_config)
+            addresses = self.startup_handshake(handshake_socket, on_head_node,
+                                               vllm_config.parallel_config)
+            self.client_count = len(addresses.outputs)
 
             # Update config which may have changed from the handshake.
             vllm_config.__post_init__()
 
             # Set up data parallel environment.
+            self.has_coordinator = addresses.coordinator_output is not None
             self._init_data_parallel(vllm_config)
 
             # Initialize engine core and model.
             super().__init__(vllm_config, executor_class, log_stats,
                              executor_fail_callback)
 
+            self.engine_index = engine_index
             self.step_fn = (self.step if self.batch_queue is None else
                             self.step_with_batch_queue)
             self.engines_running = False
+            self.last_counts = (0, 0)
 
             # Send ready message.
             num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks
-            input_socket.send(
+            handshake_socket.send(
                 msgspec.msgpack.encode({
                     "status": "READY",
                     "local": on_head_node,
                     "num_gpu_blocks": num_gpu_blocks,
                 }))
 
-            # Background Threads and Queues for IO. These enable us to
-            # overlap ZMQ socket IO with GPU since they release the GIL,
-            # and to overlap some serialization/deserialization with the
-            # model forward pass.
-            # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
-            self.input_queue = input_queue
-            self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]()
-            threading.Thread(target=self.process_input_socket,
-                             args=(input_socket, ),
-                             daemon=True).start()
-            input_socket = None
-            self.output_thread = threading.Thread(
-                target=self.process_output_socket,
-                args=(output_address, engine_index),
-                daemon=True)
-            self.output_thread.start()
-        finally:
-            if input_socket is not None:
-                input_socket.close(linger=0)
+        # Background Threads and Queues for IO. These enable us to
+        # overlap ZMQ socket IO with GPU since they release the GIL,
+        # and to overlap some serialization/deserialization with the
+        # model forward pass.
+        # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
+        self.input_queue = input_queue
+        self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
+                                              bytes]]()
+        threading.Thread(target=self.process_input_sockets,
+                         args=(addresses.inputs, addresses.coordinator_input,
+                               identity),
+                         daemon=True).start()
+        self.output_thread = threading.Thread(
+            target=self.process_output_sockets,
+            args=(addresses.outputs, addresses.coordinator_output,
+                  engine_index),
+            daemon=True)
+        self.output_thread.start()
 
     @staticmethod
-    def startup_handshake(input_socket: zmq.Socket, on_head_node: bool,
-                          parallel_config: ParallelConfig) -> str:
+    def startup_handshake(
+            handshake_socket: zmq.Socket, on_head_node: bool,
+            parallel_config: ParallelConfig) -> EngineZmqAddresses:
 
         # Send registration message.
-        input_socket.send(
+        handshake_socket.send(
             msgspec.msgpack.encode({
                 "status": "HELLO",
                 "local": on_head_node,
@@ -441,22 +446,20 @@ class EngineCoreProc(EngineCore):
 
         # Receive initialization message.
         logger.info("Waiting for init message from front-end.")
-        if not input_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60 * 1000):
+        if not handshake_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60_000):
             raise RuntimeError("Did not receive response from front-end "
                                f"process within {HANDSHAKE_TIMEOUT_MINS} "
                                f"minutes")
-        init_bytes = input_socket.recv()
-        init_message = msgspec.msgpack.decode(init_bytes)
+        init_bytes = handshake_socket.recv()
+        init_message: EngineHandshakeMetadata = msgspec.msgpack.decode(
+            init_bytes, type=EngineHandshakeMetadata)
         logger.debug("Received init message: %s", init_message)
 
-        output_socket_address = init_message["output_socket_address"]
-        #TBD(nick) maybe replace IP with configured head node address
-
-        received_parallel_config = init_message["parallel_config"]
+        received_parallel_config = init_message.parallel_config
         for key, value in received_parallel_config.items():
             setattr(parallel_config, key, value)
 
-        return output_socket_address
+        return init_message.addresses
 
     @staticmethod
     def run_engine_core(*args,
@@ -528,7 +531,7 @@ class EngineCoreProc(EngineCore):
         """Exits when an engine step needs to be performed."""
 
         waited = False
-        while not self.engines_running and not (self.scheduler.has_requests()):
+        while not self.engines_running and not self.scheduler.has_requests():
             if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                 logger.debug("EngineCore waiting for work.")
                 waited = True
@@ -549,8 +552,8 @@ class EngineCoreProc(EngineCore):
         # Step the engine core.
         outputs, model_executed = self.step_fn()
         # Put EngineCoreOutputs into the output queue.
-        if outputs is not None:
-            self.output_queue.put_nowait(outputs)
+        for output in (outputs.items() if outputs else ()):
+            self.output_queue.put_nowait(output)
 
         return model_executed
 
@@ -563,7 +566,7 @@ class EngineCoreProc(EngineCore):
         elif request_type == EngineCoreRequestType.ABORT:
             self.abort_requests(request)
         elif request_type == EngineCoreRequestType.UTILITY:
-            call_id, method_name, args = request
+            client_idx, call_id, method_name, args = request
             output = UtilityOutput(call_id)
             try:
                 method = getattr(self, method_name)
@@ -574,7 +577,7 @@ class EngineCoreProc(EngineCore):
                 output.failure_message = (f"Call to {method_name} method"
                                           f" failed: {str(e)}")
             self.output_queue.put_nowait(
-                EngineCoreOutputs(utility_output=output))
+                (client_idx, EngineCoreOutputs(utility_output=output)))
         elif request_type == EngineCoreRequestType.EXECUTOR_FAILED:
             raise RuntimeError("Executor failed.")
         else:
@@ -607,27 +610,68 @@ class EngineCoreProc(EngineCore):
             logger.fatal("vLLM shutdown signal from EngineCore failed "
                          "to send. Please report this issue.")
 
-    def process_input_socket(self, input_socket: zmq.Socket):
+    def process_input_sockets(self, input_addresses: list[str],
+                              coord_input_address: Optional[str],
+                              identity: bytes):
         """Input socket IO thread."""
 
         # Msgpack serialization decoding.
         add_request_decoder = MsgpackDecoder(EngineCoreRequest)
         generic_decoder = MsgpackDecoder()
 
-        while True:
-            # (RequestType, RequestData)
-            type_frame, *data_frames = input_socket.recv_multipart(copy=False)
-            request_type = EngineCoreRequestType(bytes(type_frame.buffer))
-
-            # Deserialize the request data.
-            decoder = add_request_decoder if (
-                request_type == EngineCoreRequestType.ADD) else generic_decoder
-            request = decoder.decode(data_frames)
-
-            # Push to input queue for core busy loop.
-            self.input_queue.put_nowait((request_type, request))
+        with ExitStack() as stack, zmq.Context() as ctx:
+            input_sockets = [
+                stack.enter_context(
+                    make_zmq_socket(ctx,
+                                    input_address,
+                                    zmq.DEALER,
+                                    identity=identity,
+                                    bind=False))
+                for input_address in input_addresses
+            ]
+            if coord_input_address is None:
+                coord_socket = None
+            else:
+                coord_socket = stack.enter_context(
+                    make_zmq_socket(ctx,
+                                    coord_input_address,
+                                    zmq.XSUB,
+                                    identity=identity,
+                                    bind=False))
+                # Send subscription message to coordinator.
+                coord_socket.send(b'\x01')
+
+            # Register sockets with poller.
+            poller = zmq.Poller()
+            for input_socket in input_sockets:
+                # Send initial message to each input socket - this is required
+                # before the front-end ROUTER socket can send input messages
+                # back to us.
+                input_socket.send(b'')
+                poller.register(input_socket, zmq.POLLIN)
+            if coord_socket is not None:
+                poller.register(coord_socket, zmq.POLLIN)
 
-    def process_output_socket(self, output_path: str, engine_index: int):
+            while True:
+                for input_socket, _ in poller.poll():
+                    # (RequestType, RequestData)
+                    type_frame, *data_frames = input_socket.recv_multipart(
+                        copy=False)
+                    request_type = EngineCoreRequestType(
+                        bytes(type_frame.buffer))
+
+                    # Deserialize the request data.
+                    decoder = add_request_decoder if (
+                        request_type
+                        == EngineCoreRequestType.ADD) else generic_decoder
+                    request = decoder.decode(data_frames)
+
+                    # Push to input queue for core busy loop.
+                    self.input_queue.put_nowait((request_type, request))
+
+    def process_output_sockets(self, output_paths: list[str],
+                               coord_output_path: Optional[str],
+                               engine_index: int):
         """Output socket IO thread."""
 
         # Msgpack serialization encoding.
@@ -641,30 +685,49 @@ class EngineCoreProc(EngineCore):
 
         # We must set linger to ensure the ENGINE_CORE_DEAD
         # message is sent prior to closing the socket.
-        with zmq_socket_ctx(output_path, zmq.constants.PUSH,
-                            linger=4000) as socket:
+        with ExitStack() as stack, zmq.Context() as ctx:
+            sockets = [
+                stack.enter_context(
+                    make_zmq_socket(ctx, output_path, zmq.PUSH, linger=4000))
+                for output_path in output_paths
+            ]
+            coord_socket = stack.enter_context(
+                make_zmq_socket(
+                    ctx, coord_output_path, zmq.PUSH, bind=False,
+                    linger=4000)) if coord_output_path is not None else None
+            max_reuse_bufs = len(sockets) + 1
+
             while True:
-                outputs = self.output_queue.get()
-                if outputs == EngineCoreProc.ENGINE_CORE_DEAD:
-                    socket.send(outputs, copy=False)
+                output = self.output_queue.get()
+                if output == EngineCoreProc.ENGINE_CORE_DEAD:
+                    for socket in sockets:
+                        socket.send(output)
                     break
-                assert not isinstance(outputs, bytes)
+                assert not isinstance(output, bytes)
+                client_index, outputs = output
                 outputs.engine_index = engine_index
 
+                if client_index == -1:
+                    # Don't reuse buffer for coordinator message
+                    # which will be very small.
+                    assert coord_socket is not None
+                    coord_socket.send_multipart(encoder.encode(outputs))
+                    continue
+
                 # Reclaim buffers that zmq is finished with.
                 while pending and pending[-1][0].done:
                     reuse_buffers.append(pending.pop()[2])
 
                 buffer = reuse_buffers.pop() if reuse_buffers else bytearray()
                 buffers = encoder.encode_into(outputs, buffer)
-                tracker = socket.send_multipart(buffers,
-                                                copy=False,
-                                                track=True)
+                tracker = sockets[client_index].send_multipart(buffers,
+                                                               copy=False,
+                                                               track=True)
                 if not tracker.done:
                     ref = outputs if len(buffers) > 1 else None
                     pending.appendleft((tracker, ref, buffer))
-                elif len(reuse_buffers) < 2:
-                    # Keep at most 2 buffers to reuse.
+                elif len(reuse_buffers) < max_reuse_bufs:
+                    # Limit the number of buffers to reuse.
                     reuse_buffers.append(buffer)
 
 
@@ -676,7 +739,7 @@ class DPEngineCoreProc(EngineCoreProc):
         self,
         vllm_config: VllmConfig,
         on_head_node: bool,
-        input_address: str,
+        handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
     ):
@@ -691,10 +754,11 @@ class DPEngineCoreProc(EngineCoreProc):
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
         self.counter = 0
+        self.current_wave = 0
 
         # Initialize the engine.
         dp_rank = vllm_config.parallel_config.data_parallel_rank
-        super().__init__(vllm_config, on_head_node, input_address,
+        super().__init__(vllm_config, on_head_node, handshake_address,
                          executor_class, log_stats, dp_rank)
 
     def _init_data_parallel(self, vllm_config: VllmConfig):
@@ -726,7 +790,6 @@ class DPEngineCoreProc(EngineCoreProc):
 
         self.dp_rank = dp_rank
         self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
-        self.current_wave = 0
 
     def shutdown(self):
         super().shutdown()
@@ -734,22 +797,23 @@ class DPEngineCoreProc(EngineCoreProc):
             stateless_destroy_torch_distributed_process_group(dp_group)
 
     def add_request(self, request: EngineCoreRequest):
-        if request.current_wave != self.current_wave:
+        if self.has_coordinator and request.current_wave != self.current_wave:
             if request.current_wave > self.current_wave:
                 self.current_wave = request.current_wave
             elif not self.engines_running:
                 # Request received for an already-completed wave, notify
                 # front-end that we need to start the next one.
                 self.output_queue.put_nowait(
-                    EngineCoreOutputs(start_wave=self.current_wave))
+                    (-1, EngineCoreOutputs(start_wave=self.current_wave)))
 
         super().add_request(request)
 
     def _handle_client_request(self, request_type: EngineCoreRequestType,
                                request: Any) -> None:
         if request_type == EngineCoreRequestType.START_DP_WAVE:
-            new_wave: int = request
-            if new_wave >= self.current_wave:
+            new_wave, exclude_eng_index = request
+            if exclude_eng_index != self.engine_index and (
+                    new_wave >= self.current_wave):
                 self.current_wave = new_wave
                 if not self.engines_running:
                     logger.debug("EngineCore starting idle loop for wave %d.",
@@ -758,6 +822,18 @@ class DPEngineCoreProc(EngineCoreProc):
         else:
             super()._handle_client_request(request_type, request)
 
+    def _maybe_publish_request_counts(self):
+        if not self.has_coordinator:
+            return
+
+        # Publish our request counts (if they've changed).
+        counts = self.scheduler.get_request_counts()
+        if counts != self.last_counts:
+            self.last_counts = counts
+            stats = SchedulerStats(*counts)
+            self.output_queue.put_nowait(
+                (-1, EngineCoreOutputs(scheduler_stats=stats)))
+
     def run_busy_loop(self):
         """Core busy loop of the EngineCore for data parallel case."""
 
@@ -768,6 +844,8 @@ class DPEngineCoreProc(EngineCoreProc):
 
             # 2) Step the engine core.
             executed = self._process_engine_step()
+            self._maybe_publish_request_counts()
+
             local_unfinished_reqs = self.scheduler.has_unfinished_requests()
             if not executed:
                 if not local_unfinished_reqs and not self.engines_running:
@@ -788,7 +866,8 @@ class DPEngineCoreProc(EngineCoreProc):
                     logger.debug("Wave %d finished, pausing engine loop.",
                                  self.current_wave)
                     self.output_queue.put_nowait(
-                        EngineCoreOutputs(wave_complete=self.current_wave))
+                        (-1,
+                         EngineCoreOutputs(wave_complete=self.current_wave)))
                 self.current_wave += 1
 
     def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 9f8a9b692..e9e2d2d8d 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -2,6 +2,7 @@
 import asyncio
 import contextlib
 import queue
+import sys
 import uuid
 import weakref
 from abc import ABC, abstractmethod
@@ -9,26 +10,28 @@ from collections import deque
 from collections.abc import Awaitable, Sequence
 from concurrent.futures import Future
 from dataclasses import dataclass
-from enum import Enum, auto
 from threading import Thread
 from typing import Any, Callable, Optional, TypeVar, Union
 
-import msgspec
+import msgspec.msgpack
 import zmq
 import zmq.asyncio
 
-from vllm.config import ParallelConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.utils import (get_open_port, get_open_zmq_inproc_path,
-                        get_open_zmq_ipc_path, get_tcp_uri, make_zmq_socket)
+from vllm.utils import (get_open_zmq_inproc_path, make_zmq_socket,
+                        zmq_socket_ctx)
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                             EngineCoreRequestType, UtilityOutput)
+from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
-from vllm.v1.utils import CoreEngineProcManager
+from vllm.v1.utils import (CoreEngine, CoreEngineProcManager,
+                           EngineZmqAddresses, get_engine_client_zmq_addr,
+                           wait_for_engine_startup)
 
 logger = init_logger(__name__)
 
@@ -36,8 +39,6 @@ AnyFuture = Union[asyncio.Future[Any], Future[Any]]
 
 _R = TypeVar('_R')  # Return type for collective_rpc
 
-STARTUP_POLL_PERIOD_MS = 10000
-
 
 class EngineCoreClient(ABC):
     """
@@ -207,7 +208,7 @@ class InprocClient(EngineCoreClient):
 
     def get_output(self) -> EngineCoreOutputs:
         outputs, _ = self.engine_core.step()
-        return outputs
+        return outputs.get(0) or EngineCoreOutputs()
 
     def add_request(self, request: EngineCoreRequest) -> None:
         self.engine_core.add_request(request)
@@ -266,24 +267,6 @@ class InprocClient(EngineCoreClient):
         return self.engine_core.collective_rpc(method, timeout, args, kwargs)
 
 
-class CoreEngineState(Enum):
-    NEW = auto()
-    CONNECTED = auto()
-    READY = auto()
-
-
-class CoreEngine:
-    """One per data parallel rank."""
-
-    def __init__(self, index: int = 0, local: bool = True):
-        self.local = local
-        self.index = index
-        self.identity = index.to_bytes(length=2, byteorder="little")
-
-        self.state = CoreEngineState.NEW
-        self.num_reqs_in_flight = 0
-
-
 @dataclass
 class BackgroundResources:
     """Used as a finalizer for clean shutdown, avoiding
@@ -291,9 +274,12 @@ class BackgroundResources:
 
     ctx: Union[zmq.Context]
     local_engine_manager: Optional[CoreEngineProcManager] = None
+    coordinator: Optional[DPCoordinator] = None
     output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
     input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
+    first_req_send_socket: Optional[zmq.asyncio.Socket] = None
     output_queue_task: Optional[asyncio.Task] = None
+    stats_update_task: Optional[asyncio.Task] = None
     shutdown_path: Optional[str] = None
 
     # Set if any of the engines are dead. Here so that the output
@@ -306,16 +292,21 @@ class BackgroundResources:
         self.engine_dead = True
         if self.local_engine_manager is not None:
             self.local_engine_manager.close()
+        if self.coordinator is not None:
+            self.coordinator.close()
 
         if self.output_queue_task is not None:
             self.output_queue_task.cancel()
+        if self.stats_update_task is not None:
+            self.stats_update_task.cancel()
 
         # ZMQ context termination can hang if the sockets
         # aren't explicitly closed first.
-        if self.output_socket is not None:
-            self.output_socket.close(linger=0)
-        if self.input_socket is not None:
-            self.input_socket.close(linger=0)
+        for socket in (self.output_socket, self.input_socket,
+                       self.first_req_send_socket):
+            if socket is not None:
+                socket.close(linger=0)
+
         if self.shutdown_path is not None:
             # We must ensure that the sync output socket is
             # closed cleanly in its own thread.
@@ -350,6 +341,7 @@ class MPClient(EngineCoreClient):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
+        client_addresses: Optional[dict[str, str]] = None,
     ):
         self.vllm_config = vllm_config
         # Serialization setup.
@@ -369,8 +361,8 @@ class MPClient(EngineCoreClient):
         try:
             parallel_config = vllm_config.parallel_config
             local_engine_count = parallel_config.data_parallel_size_local
-            start_index = parallel_config.data_parallel_rank
             local_start_index = parallel_config.data_parallel_rank_local
+            dp_size = parallel_config.data_parallel_size
 
             # SPMD mode is where there is an LLM instance per DP rank and
             # one core engine per LLM, see
@@ -382,42 +374,53 @@ class MPClient(EngineCoreClient):
                     CoreEngine(index=local_start_index, local=True)
                 ]
             else:
-                assert start_index == 0
+                assert parallel_config.data_parallel_rank == 0
                 local_start_index = 0
                 self.core_engines = [
                     CoreEngine(index=i, local=(i < local_engine_count))
-                    for i in range(parallel_config.data_parallel_size)
+                    for i in range(dp_size)
                 ]
 
-            input_address, output_address = self._get_zmq_addresses(
-                parallel_config, spmd_mode)
+            local_only = spmd_mode or local_engine_count == dp_size
+
+            self.stats_update_address: Optional[str] = None
+            if client_addresses is not None:
+                input_address = client_addresses["input_address"]
+                output_address = client_addresses["output_address"]
+                self.stats_update_address = client_addresses.get(
+                    "stats_update_address")
+            else:
+                host = parallel_config.data_parallel_master_ip
+                input_address = get_engine_client_zmq_addr(local_only, host)
+                output_address = get_engine_client_zmq_addr(local_only, host)
 
             # Create input and output sockets.
             self.input_socket = self.resources.input_socket = make_zmq_socket(
                 self.ctx, input_address, zmq.ROUTER, bind=True)
-
             self.resources.output_socket = make_zmq_socket(
-                self.ctx, output_address, zmq.constants.PULL)
-            # Start local engines.
-            if local_engine_count:
-                # In server mode, start_index and local_start_index will
-                # both be 0.
-                self.resources.local_engine_manager = CoreEngineProcManager(
-                    EngineCoreProc.run_engine_core,
-                    vllm_config=vllm_config,
-                    executor_class=executor_class,
-                    log_stats=log_stats,
-                    input_address=input_address,
-                    on_head_node=True,
-                    local_engine_count=local_engine_count,
-                    start_index=start_index,
-                    local_start_index=local_start_index)
+                self.ctx, output_address, zmq.PULL)
+
+            if client_addresses is None:
+                self._init_engines_direct(vllm_config, local_only,
+                                          local_start_index, input_address,
+                                          output_address, executor_class,
+                                          log_stats)
+                coordinator = self.resources.coordinator
+                if coordinator:
+                    self.stats_update_address = (
+                        coordinator.get_stats_publish_address())
+
+            # Wait for ready messages from each engine on the input socket.
+            identities = set(e.identity for e in self.core_engines)
+            sync_input_socket = zmq.Socket.shadow(self.input_socket)
+            while identities:
+                if not sync_input_socket.poll(timeout=600_000):
+                    raise TimeoutError("Timed out waiting for engines to send"
+                                       "initial message on input socket.")
+                identity, _ = sync_input_socket.recv_multipart()
+                identities.remove(identity)
 
             self.core_engine = self.core_engines[0]
-
-            # Wait for engine core process(es) to start.
-            self._wait_for_engine_startup(output_address, parallel_config)
-
             self.utility_results: dict[int, AnyFuture] = {}
 
             # Request objects which may contain pytorch-allocated tensors
@@ -430,116 +433,67 @@ class MPClient(EngineCoreClient):
             if not success:
                 self._finalizer()
 
-    @staticmethod
-    def _get_zmq_addresses(parallel_config: ParallelConfig,
-                           spmd_mode: bool) -> tuple[str, str]:
-        """Returns (input_address, output_address)."""
-        dp_size = parallel_config.data_parallel_size
+    def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool,
+                             local_start_index: int, input_address: str,
+                             output_address: str,
+                             executor_class: type[Executor], log_stats: bool):
+        """Self-contained client mode, launch engine and coordinator process
+        as needed."""
+
+        parallel_config = vllm_config.parallel_config
         local_engine_count = parallel_config.data_parallel_size_local
+        start_index = parallel_config.data_parallel_rank
+        host = parallel_config.data_parallel_master_ip
 
-        if local_engine_count == dp_size or spmd_mode:
-            input_address = get_open_zmq_ipc_path()
-            output_address = get_open_zmq_ipc_path()
-        else:
-            host = parallel_config.data_parallel_master_ip
-            input_port = parallel_config.data_parallel_rpc_port
-            output_port = get_open_port()
-            input_address = get_tcp_uri(host, input_port)
-            output_address = get_tcp_uri(host, output_port)
-
-        return input_address, output_address
-
-    def _wait_for_engine_startup(self, output_address: str,
-                                 parallel_config: ParallelConfig):
-        # Get a sync handle to the socket which can be sync or async.
-        sync_input_socket = zmq.Socket.shadow(self.input_socket)
-
-        # Wait for engine core process(es) to send ready messages.
-        local_count = parallel_config.data_parallel_size_local
-        remote_count = len(self.core_engines) - local_count
-        # [local, remote] counts
-        conn_pending, start_pending = [local_count, remote_count], [0, 0]
-
-        poller = zmq.Poller()
-        poller.register(sync_input_socket, zmq.POLLIN)
-        proc_manager = self.resources.local_engine_manager
-        if proc_manager is not None:
-            for sentinel in proc_manager.sentinels():
-                poller.register(sentinel, zmq.POLLIN)
-        while any(conn_pending) or any(start_pending):
-            events = poller.poll(STARTUP_POLL_PERIOD_MS)
-            if not events:
-                if any(conn_pending):
-                    logger.debug(
-                        "Waiting for %d local, %d remote core engine proc(s) "
-                        "to connect.", *conn_pending)
-                if any(start_pending):
-                    logger.debug(
-                        "Waiting for %d local, %d remote core engine proc(s) "
-                        "to start.", *start_pending)
-                continue
-            if len(events) > 1 or events[0][0] != sync_input_socket:
-                # One of the local core processes exited.
-                finished = proc_manager.finished_procs(
-                ) if proc_manager else {}
-                raise RuntimeError("Engine core initialization failed. "
-                                   "See root cause above. "
-                                   f"Failed core proc(s): {finished}")
-
-            # Receive HELLO and READY messages from the input socket.
-            eng_identity, ready_msg_bytes = sync_input_socket.recv_multipart()
-            eng_index = int.from_bytes(eng_identity, byteorder="little")
-            engine = next(
-                (e for e in self.core_engines if e.identity == eng_identity),
-                None)
-            if engine is None:
-                raise RuntimeError(f"Message from engine with unexpected data "
-                                   f"parallel rank: {eng_index}")
-            msg = msgspec.msgpack.decode(ready_msg_bytes)
-            status, local = msg["status"], msg["local"]
-            if local != engine.local:
-                raise RuntimeError(f"{status} message from "
-                                   f"{'local' if local else 'remote'} "
-                                   f"engine {eng_index}, expected it to be "
-                                   f"{'local' if engine.local else 'remote'}")
-
-            if status == "HELLO" and engine.state == CoreEngineState.NEW:
-
-                # Send init message with DP config info.
-                init_message = self.encoder.encode({
-                    "output_socket_address": output_address,
-                    "parallel_config": {
-                        "data_parallel_master_ip":
-                        parallel_config.data_parallel_master_ip,
-                        "data_parallel_master_port":
-                        parallel_config.data_parallel_master_port,
-                        "data_parallel_size":
-                        parallel_config.data_parallel_size,
-                    },
-                })
-                sync_input_socket.send_multipart((eng_identity, *init_message),
-                                                 copy=False)
-                conn_pending[0 if local else 1] -= 1
-                start_pending[0 if local else 1] += 1
-                engine.state = CoreEngineState.CONNECTED
-            elif status == "READY" and (engine.state
-                                        == CoreEngineState.CONNECTED):
-                # Setup KV cache config with initialization state from
-                # engine core process. Sum values from all engines in DP case.
-                cache_config = self.vllm_config.cache_config
-                num_gpu_blocks = cache_config.num_gpu_blocks or 0
-                num_gpu_blocks += msg['num_gpu_blocks']
-                cache_config.num_gpu_blocks = num_gpu_blocks
-
-                start_pending[0 if local else 1] -= 1
-                engine.state = CoreEngineState.READY
-            else:
-                raise RuntimeError(f"Unexpected {status} message for "
-                                   f"{'local' if local else 'remote'} engine "
-                                   f"{eng_index} in {engine.state} state.")
+        if len(self.core_engines) > 1:
+            self.resources.coordinator = DPCoordinator(parallel_config)
+
+        handshake_address = get_engine_client_zmq_addr(
+            local_only, host, parallel_config.data_parallel_rpc_port)
 
-            logger.debug("%s from %s core engine process %s.", status,
-                         "local" if local else "remote", eng_index)
+        with zmq_socket_ctx(handshake_address, zmq.ROUTER,
+                            bind=True) as handshake_socket:
+
+            # Start local engines.
+            if local_engine_count:
+                # In server mode, start_index and local_start_index will
+                # both be 0.
+                self.resources.local_engine_manager = CoreEngineProcManager(
+                    EngineCoreProc.run_engine_core,
+                    vllm_config=vllm_config,
+                    executor_class=executor_class,
+                    log_stats=log_stats,
+                    handshake_address=handshake_address,
+                    on_head_node=True,
+                    local_engine_count=local_engine_count,
+                    start_index=start_index,
+                    local_start_index=local_start_index)
+
+            # Wait for engine core process(es) to start.
+            self._wait_for_engine_startup(handshake_socket, input_address,
+                                          output_address)
+
+    def _wait_for_engine_startup(self, handshake_socket: zmq.Socket,
+                                 input_address: str, output_address: str):
+        addresses = EngineZmqAddresses(
+            inputs=[input_address],
+            outputs=[output_address],
+        )
+
+        coordinator = self.resources.coordinator
+        if coordinator is not None:
+            addresses.coordinator_input, addresses.coordinator_output = (
+                coordinator.get_engine_socket_addresses())
+
+        wait_for_engine_startup(
+            handshake_socket,
+            addresses,
+            self.core_engines,
+            self.vllm_config.parallel_config,
+            self.vllm_config.cache_config,
+            self.resources.local_engine_manager,
+            coordinator.proc if coordinator else None,
+        )
 
     def shutdown(self):
         # Terminate background resources.
@@ -605,8 +559,8 @@ class SyncMPClient(MPClient):
             try:
                 shutdown_socket.bind(shutdown_path)
                 poller = zmq.Poller()
-                poller.register(shutdown_socket)
-                poller.register(out_socket)
+                poller.register(shutdown_socket, zmq.POLLIN)
+                poller.register(out_socket, zmq.POLLIN)
                 while True:
                     socks = poller.poll()
                     if not socks:
@@ -668,7 +622,7 @@ class SyncMPClient(MPClient):
         future: Future[Any] = Future()
         self.utility_results[call_id] = future
         self._send_input(EngineCoreRequestType.UTILITY,
-                         (call_id, method, args))
+                         (0, call_id, method, args))
 
         return future.result()
 
@@ -730,15 +684,21 @@ class SyncMPClient(MPClient):
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
-                 log_stats: bool):
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: type[Executor],
+                 log_stats: bool,
+                 client_addresses: Optional[dict[str, str]] = None,
+                 client_index: int = 0):
         super().__init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
             log_stats=log_stats,
+            client_addresses=client_addresses,
         )
 
+        self.client_index = client_index
         self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs,
                                                  Exception]]()
         try:
@@ -854,12 +814,13 @@ class AsyncMPClient(MPClient):
         future = asyncio.get_running_loop().create_future()
         self.utility_results[call_id] = future
         message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode(
-            (call_id, method, args)))
+            (self.client_index, call_id, method, args)))
         await self._send_input_message(message, engine, args)
         self._ensure_output_queue_task()
         return await future
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
+        request.client_index = self.client_index
         await self._send_input(EngineCoreRequestType.ADD, request)
         self._ensure_output_queue_task()
 
@@ -921,17 +882,120 @@ class DPAsyncMPClient(AsyncMPClient):
     """Asyncio-compatible client for multi-proc, multi-engine (data parallel)
     EngineCore."""
 
-    def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
-                 log_stats: bool):
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: type[Executor],
+                 log_stats: bool,
+                 client_addresses: Optional[dict[str, str]] = None,
+                 client_index: int = 0):
 
         self.current_wave = 0
         self.engines_running = False
+        # To route aborts to the correct engine.
         self.reqs_in_flight: dict[str, CoreEngine] = {}
 
-        super().__init__(vllm_config, executor_class, log_stats)
+        super().__init__(vllm_config, executor_class, log_stats,
+                         client_addresses, client_index)
 
         assert len(self.core_engines) > 1
 
+        # List of [waiting, running] pair per engine.
+        self.lb_engines: list[list[int]] = []
+
+        self.first_req_sock_addr = get_open_zmq_inproc_path()
+        self.first_req_send_socket = self.resources.first_req_send_socket = (
+            make_zmq_socket(self.ctx,
+                            self.first_req_sock_addr,
+                            zmq.PAIR,
+                            bind=True))
+        try:
+            # If we are running in an asyncio event loop, start the stats task.
+            # Otherwise, it will be started lazily.
+            asyncio.get_running_loop()
+            self._ensure_stats_update_task()
+        except RuntimeError:
+            pass
+
+    def _ensure_stats_update_task(self):
+        resources = self.resources
+        if resources.stats_update_task is not None:
+            return
+
+        assert self.stats_update_address is not None
+
+        async def run_engine_stats_update_task():
+            with make_zmq_socket(self.ctx, self.stats_update_address,
+                                 zmq.XSUB) as socket, make_zmq_socket(
+                                     self.ctx,
+                                     self.first_req_sock_addr,
+                                     zmq.PAIR,
+                                     bind=False) as first_req_rcv_socket:
+                # Send subscription message.
+                await socket.send(b'\x01')
+
+                poller = zmq.asyncio.Poller()
+                poller.register(socket, zmq.POLLIN)
+                poller.register(first_req_rcv_socket, zmq.POLLIN)
+
+                while True:
+                    events = await poller.poll()
+                    if not self.engines_running and len(events) == 2 or (
+                            events[0][0] == first_req_rcv_socket):
+                        # Send a message to notify the coordinator that
+                        # we're sending a request while the engines are
+                        # paused, so that it can wake the others up
+                        # (to run dummy EP loop).
+                        self.engines_running = True
+                        buf = first_req_rcv_socket.recv(
+                            flags=zmq.NOBLOCK).result()
+                        target_eng_index = int.from_bytes(buf, "little")
+                        msg = msgspec.msgpack.encode(
+                            (target_eng_index, self.current_wave))
+                        await socket.send(msg)
+
+                    buf = None
+                    while True:
+                        # Drain all stats events (we only care about latest).
+                        future: asyncio.Future[bytes] = socket.recv(
+                            flags=zmq.NOBLOCK)
+                        if isinstance(future.exception(), zmq.Again):
+                            break
+                        buf = future.result()
+                    if buf is None:
+                        continue
+
+                    # Update local load-balancing state.
+                    counts, wave, running = msgspec.msgpack.decode(buf)
+                    self.current_wave = wave
+                    self.engines_running = running
+                    self.lb_engines = counts
+
+        resources.stats_update_task = asyncio.create_task(
+            run_engine_stats_update_task())
+
+    def get_core_engine_for_request(self) -> CoreEngine:
+        if not self.lb_engines:
+            return self.core_engines[0]
+        # TODO use P2C alg for larger DP sizes
+        num_engines = len(self.lb_engines)
+        min_counts = [sys.maxsize, sys.maxsize]
+        eng_index = 0
+        for i in range(num_engines):
+            # Start from client_index to help with balancing when engines
+            # are empty.
+            idx = (self.client_index + i) % num_engines
+            counts = self.lb_engines[idx]
+            if counts < min_counts:
+                min_counts = counts
+                eng_index = idx
+        # Adjust local counts for better balancing between stats updates
+        # from the coordinator (which happen every 100ms).
+        if min_counts[0]:
+            min_counts[0] += 1
+        else:
+            min_counts[1] += 1
+        return self.core_engines[eng_index]
+
     async def call_utility_async(self, method: str, *args) -> Any:
         # Only the result from the first engine is returned.
         return (await asyncio.gather(*[
@@ -940,62 +1004,30 @@ class DPAsyncMPClient(AsyncMPClient):
         ]))[0]
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
+        self._ensure_stats_update_task()
+
         request.current_wave = self.current_wave
+        request.client_index = self.client_index
 
         chosen_engine = self.get_core_engine_for_request()
         self.reqs_in_flight[request.request_id] = chosen_engine
-        chosen_engine.num_reqs_in_flight += 1
 
         to_await = self._send_input(EngineCoreRequestType.ADD, request,
                                     chosen_engine)
         if not self.engines_running:
-            # Send request to chosen engine and dp start loop
-            # control message to all other engines.
-            self.engines_running = True
-            to_await = asyncio.gather(
-                to_await,  # type: ignore[assignment]
-                *self._start_wave_coros(exclude_index=chosen_engine.index))
+            # Notify coordinator that we're sending a request
+            await self.first_req_send_socket.send(chosen_engine.identity)
 
         await to_await
 
         self._ensure_output_queue_task()
 
-    def get_core_engine_for_request(self) -> CoreEngine:
-        return min(self.core_engines, key=lambda e: e.num_reqs_in_flight)
-
     @staticmethod
     async def process_engine_outputs(self: "DPAsyncMPClient",
                                      outputs: EngineCoreOutputs):
-        if self.reqs_in_flight:
-            for req_id in outputs.finished_requests or ():
-                if engine := self.reqs_in_flight.pop(req_id, None):
-                    engine.num_reqs_in_flight -= 1
-
-        if outputs.wave_complete is not None:
-            # Current wave is complete, move to next wave number
-            # and mark engines as paused.
-            if self.current_wave <= outputs.wave_complete:
-                self.current_wave = outputs.wave_complete + 1
-                self.engines_running = False
-
-        elif outputs.start_wave is not None and (
-                outputs.start_wave > self.current_wave or
-            (outputs.start_wave == self.current_wave
-             and not self.engines_running)):
-            # Engine received request for a non-current wave so we must ensure
-            # that other engines progress to the next wave.
-            self.current_wave = outputs.start_wave
-            self.engines_running = True
-            await asyncio.gather(*self._start_wave_coros(
-                exclude_index=outputs.engine_index))
-
-    def _start_wave_coros(self, exclude_index: int) -> list[Awaitable[None]]:
-        logger.debug("Sending start DP wave %d.", self.current_wave)
-        return [
-            self._send_input(EngineCoreRequestType.START_DP_WAVE,
-                             self.current_wave, engine)
-            for engine in self.core_engines if engine.index != exclude_index
-        ]
+        if outputs.finished_requests and self.reqs_in_flight:
+            for req_id in outputs.finished_requests:
+                self.reqs_in_flight.pop(req_id, None)
 
     async def abort_requests_async(self, request_ids: list[str]) -> None:
         if not request_ids:
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 3dc2f7744..665e5873d 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -12,13 +12,12 @@ from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
+from vllm.v1.metrics.prometheus import unregister_vllm_metrics
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm
 
 logger = init_logger(__name__)
 
-_LOCAL_LOGGING_INTERVAL_SEC = 5.0
-
 StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"]
 
 
@@ -35,7 +34,7 @@ class StatLoggerBase(ABC):
         ...
 
     @abstractmethod
-    def record(self, scheduler_stats: SchedulerStats,
+    def record(self, scheduler_stats: Optional[SchedulerStats],
                iteration_stats: Optional[IterationStats]):
         ...
 
@@ -78,20 +77,22 @@ class LoggingStatLogger(StatLoggerBase):
         # Compute summary metrics for tracked stats
         return float(np.sum(tracked_stats) / (now - self.last_log_time))
 
-    def record(self, scheduler_stats: SchedulerStats,
+    def record(self, scheduler_stats: Optional[SchedulerStats],
                iteration_stats: Optional[IterationStats]):
         """Log Stats to standard output."""
 
         if iteration_stats:
             self._track_iteration_stats(iteration_stats)
 
-        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+        if scheduler_stats is not None:
+            self.prefix_caching_metrics.observe(
+                scheduler_stats.prefix_cache_stats)
 
-        if scheduler_stats.spec_decoding_stats is not None:
-            self.spec_decoding_logging.observe(
-                scheduler_stats.spec_decoding_stats)
+            if scheduler_stats.spec_decoding_stats is not None:
+                self.spec_decoding_logging.observe(
+                    scheduler_stats.spec_decoding_stats)
 
-        self.last_scheduler_stats = scheduler_stats
+            self.last_scheduler_stats = scheduler_stats
 
     def log(self):
         now = time.monotonic()
@@ -131,10 +132,11 @@ class LoggingStatLogger(StatLoggerBase):
         self.spec_decoding_logging.log(log_fn=log_fn)
 
     def log_engine_initialized(self):
-        logger.info(
-            "vllm cache_config_info with initialization " \
-            "after num_gpu_blocks is: %d",
-            self.vllm_config.cache_config.num_gpu_blocks)
+        if self.vllm_config.cache_config.num_gpu_blocks:
+            logger.info(
+                "Engine %03d: vllm cache_config_info with initialization "
+                "after num_gpu_blocks is: %d", self.engine_index,
+                self.vllm_config.cache_config.num_gpu_blocks)
 
 
 class PrometheusStatLogger(StatLoggerBase):
@@ -144,7 +146,8 @@ class PrometheusStatLogger(StatLoggerBase):
     _spec_decoding_cls = SpecDecodingProm
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
-        self._unregister_vllm_metrics()
+
+        unregister_vllm_metrics()
         self.vllm_config = vllm_config
         self.engine_index = engine_index
         # Use this flag to hide metrics that were deprecated in
@@ -169,11 +172,13 @@ class PrometheusStatLogger(StatLoggerBase):
         self.gauge_scheduler_running = self._gauge_cls(
             name="vllm:num_requests_running",
             documentation="Number of requests in model execution batches.",
+            multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)
 
         self.gauge_scheduler_waiting = self._gauge_cls(
             name="vllm:num_requests_waiting",
             documentation="Number of requests waiting to be processed.",
+            multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)
 
         #
@@ -182,6 +187,7 @@ class PrometheusStatLogger(StatLoggerBase):
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)
 
         self.counter_gpu_prefix_cache_queries = self._counter_cls(
@@ -242,6 +248,9 @@ class PrometheusStatLogger(StatLoggerBase):
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
+        # TODO: This metric might be incorrect in case of using multiple
+        # api_server counts which uses prometheus mp.
+        # See: https://github.com/vllm-project/vllm/pull/18053
         self.histogram_iteration_tokens = \
             self._histogram_cls(
                 name="vllm:iteration_tokens_total",
@@ -340,6 +349,9 @@ class PrometheusStatLogger(StatLoggerBase):
         #
         # LoRA metrics
         #
+
+        # TODO: This metric might be incorrect in case of using multiple
+        # api_server counts which uses prometheus mp.
         self.gauge_lora_info: Optional[prometheus_client.Gauge] = None
         if vllm_config.lora_config is not None:
             self.labelname_max_lora = "max_lora"
@@ -350,13 +362,16 @@ class PrometheusStatLogger(StatLoggerBase):
                 self._gauge_cls(
                     name="vllm:lora_requests_info",
                     documentation="Running stats on lora requests.",
+                    multiprocess_mode="sum",
                     labelnames=[
                         self.labelname_max_lora,
                         self.labelname_waiting_lora_adapters,
                         self.labelname_running_lora_adapters,
-                    ])
+                    ],
+                )
 
     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
+
         metrics_info = config_obj.metrics_info()
         metrics_info["engine"] = self.engine_index
 
@@ -372,25 +387,28 @@ class PrometheusStatLogger(StatLoggerBase):
         info_gauge = self._gauge_cls(
             name=name,
             documentation=documentation,
-            labelnames=metrics_info.keys()).labels(**metrics_info)
+            multiprocess_mode="mostrecent",
+            labelnames=metrics_info.keys(),
+        ).labels(**metrics_info)
         info_gauge.set(1)
 
-    def record(self, scheduler_stats: SchedulerStats,
+    def record(self, scheduler_stats: Optional[SchedulerStats],
                iteration_stats: Optional[IterationStats]):
         """Log to prometheus."""
-        self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
-        self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
+        if scheduler_stats is not None:
+            self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
+            self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
-        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+            self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
 
-        self.counter_gpu_prefix_cache_queries.inc(
-            scheduler_stats.prefix_cache_stats.queries)
-        self.counter_gpu_prefix_cache_hits.inc(
-            scheduler_stats.prefix_cache_stats.hits)
+            self.counter_gpu_prefix_cache_queries.inc(
+                scheduler_stats.prefix_cache_stats.queries)
+            self.counter_gpu_prefix_cache_hits.inc(
+                scheduler_stats.prefix_cache_stats.hits)
 
-        if scheduler_stats.spec_decoding_stats is not None:
-            self.spec_decoding_prom.observe(
-                scheduler_stats.spec_decoding_stats)
+            if scheduler_stats.spec_decoding_stats is not None:
+                self.spec_decoding_prom.observe(
+                    scheduler_stats.spec_decoding_stats)
 
         if iteration_stats is None:
             return
@@ -445,13 +463,6 @@ class PrometheusStatLogger(StatLoggerBase):
             self.gauge_lora_info.labels(**lora_info_labels)\
                                 .set_to_current_time()
 
-    @staticmethod
-    def _unregister_vllm_metrics():
-        # Unregister any existing vLLM collectors (for CI/CD
-        for collector in list(prometheus_client.REGISTRY._collector_to_names):
-            if hasattr(collector, "_name") and "vllm" in collector._name:
-                prometheus_client.REGISTRY.unregister(collector)
-
     def log_engine_initialized(self):
         self.log_metrics_info("cache_config", self.vllm_config.cache_config)
 
diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py
new file mode 100644
index 000000000..f12568535
--- /dev/null
+++ b/vllm/v1/metrics/prometheus.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import tempfile
+from typing import Optional
+
+from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+# Global temporary directory for prometheus multiprocessing
+_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None
+
+
+def setup_multiprocess_prometheus():
+    """Set up prometheus multiprocessing directory if not already configured.
+    
+    """
+    global _prometheus_multiproc_dir
+
+    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
+        # Make TemporaryDirectory for prometheus multiprocessing
+        # Note: global TemporaryDirectory will be automatically
+        # cleaned up upon exit.
+        _prometheus_multiproc_dir = tempfile.TemporaryDirectory()
+        os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name
+        logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s",
+                     _prometheus_multiproc_dir.name)
+    else:
+        logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. "
+                       "This directory must be wiped between vLLM runs or "
+                       "you will find inaccurate metrics. Unset the variable "
+                       "and vLLM will properly handle cleanup.")
+
+
+def get_prometheus_registry():
+    """Get the appropriate prometheus registry based on multiprocessing 
+    configuration.
+    
+    Returns:
+        Registry: A prometheus registry
+    """
+    if os.getenv("PROMETHEUS_MULTIPROC_DIR") is not None:
+        logger.debug("Using multiprocess registry for prometheus metrics")
+        registry = CollectorRegistry()
+        multiprocess.MultiProcessCollector(registry)
+        return registry
+
+    return REGISTRY
+
+
+def unregister_vllm_metrics():
+    """Unregister any existing vLLM collectors from the prometheus registry.
+    
+    This is useful for testing and CI/CD where metrics may be registered
+    multiple times across test runs.
+    
+    Also, in case of multiprocess, we need to unregister the metrics from the 
+    global registry.
+    """
+    registry = REGISTRY
+    # Unregister any existing vLLM collectors
+    for collector in list(registry._collector_to_names):
+        if hasattr(collector, "_name") and "vllm" in collector._name:
+            registry.unregister(collector)
+
+
+def shutdown_prometheus():
+    """Shutdown prometheus metrics."""
+    try:
+        pid = os.getpid()
+        multiprocess.mark_process_dead(pid)
+        logger.debug("Marked Prometheus metrics for process %d as dead", pid)
+    except Exception as e:
+        logger.error("Error during metrics cleanup: %s", str(e))
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index b4c845075..42c75ef96 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -26,12 +26,13 @@ class Request:
         multi_modal_placeholders: Optional[list[PlaceholderRange]],
         sampling_params: SamplingParams,
         eos_token_id: Optional[int],
-        arrival_time: float,
+        client_index: int = 0,
         lora_request: Optional["LoRARequest"] = None,
         structured_output_request: Optional["StructuredOutputRequest"] = None,
         cache_salt: Optional[str] = None,
     ) -> None:
         self.request_id = request_id
+        self.client_index = client_index
         self.sampling_params = sampling_params
         # Because of LoRA, the eos token id can be different for each request.
         self.eos_token_id = eos_token_id
@@ -90,13 +91,13 @@ class Request:
 
         return cls(
             request_id=request.request_id,
+            client_index=request.client_index,
             prompt_token_ids=request.prompt_token_ids,
             multi_modal_inputs=request.mm_inputs,
             multi_modal_hashes=request.mm_hashes,
             multi_modal_placeholders=request.mm_placeholders,
             sampling_params=request.sampling_params,
             eos_token_id=request.eos_token_id,
-            arrival_time=request.arrival_time,
             lora_request=request.lora_request,
             structured_output_request=StructuredOutputRequest(
                 sampling_params=request.sampling_params),
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 0758747a8..a26794561 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,31 +1,41 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
+import argparse
+import multiprocessing
 import time
 import weakref
 from collections import defaultdict
 from collections.abc import Sequence
+from dataclasses import dataclass
+from enum import Enum, auto
 from multiprocessing import Process, connection
-from typing import (TYPE_CHECKING, Callable, Generic, Optional, TypeVar, Union,
-                    overload)
+from multiprocessing.process import BaseProcess
+from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar,
+                    Union, overload)
 
+import msgspec
 import torch
+import zmq
 
-from vllm.config import VllmConfig
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import get_mp_context, kill_process_tree
+from vllm.utils import (get_mp_context, get_open_port, get_open_zmq_ipc_path,
+                        get_tcp_uri, kill_process_tree)
 from vllm.v1.executor.abstract import Executor
 
 if TYPE_CHECKING:
     from vllm.attention.layer import Attention
+    from vllm.v1.engine.coordinator import DPCoordinator
 
 logger = init_logger(__name__)
 
 T = TypeVar("T")
 
+STARTUP_POLL_PERIOD_MS = 10000
+
 
 class ConstantList(Generic[T], Sequence):
 
@@ -95,6 +105,78 @@ class ConstantList(Generic[T], Sequence):
         return f"ConstantList({self._x})"
 
 
+def get_engine_client_zmq_addr(local_only: bool,
+                               host: str,
+                               port: int = 0) -> str:
+    return get_open_zmq_ipc_path() if local_only else (get_tcp_uri(
+        host, port or get_open_port()))
+
+
+class APIServerProcessManager:
+    """Manages a group of API server processes.
+    
+    Handles creation, monitoring, and termination of API server worker
+    processes. Also monitors extra processes to check if they are healthy.
+    """
+
+    def __init__(
+        self,
+        target_server_fn: Callable,
+        listen_address: str,
+        sock: Any,
+        args: argparse.Namespace,
+        num_servers: int,
+        input_addresses: list[str],
+        output_addresses: list[str],
+        stats_update_address: Optional[str] = None,
+    ):
+        """Initialize and start API server worker processes.
+        
+        Args:
+            target_server_fn: Function to call for each API server process
+            listen_address: Address to listen for client connections
+            sock: Socket for client connections
+            args: Command line arguments
+            num_servers: Number of API server processes to start
+            input_addresses: Input addresses for each API server
+            output_addresses: Output addresses for each API server
+            stats_update_address: Optional stats update address 
+        """
+        self.listen_address = listen_address
+        self.sock = sock
+        self.args = args
+
+        # Start API servers
+        spawn_context = multiprocessing.get_context("spawn")
+        self.processes: list[BaseProcess] = []
+
+        for i, in_addr, out_addr in zip(range(num_servers), input_addresses,
+                                        output_addresses):
+            client_config = {
+                "input_address": in_addr,
+                "output_address": out_addr,
+                "client_index": i
+            }
+            if stats_update_address is not None:
+                client_config["stats_update_address"] = stats_update_address
+
+            proc = spawn_context.Process(target=target_server_fn,
+                                         name=f"ApiServer_{i}",
+                                         args=(listen_address, sock, args,
+                                               client_config))
+            self.processes.append(proc)
+            proc.start()
+
+        logger.info("Started %d API server processes", len(self.processes))
+
+        # Shutdown only the API server processes on garbage collection
+        # The extra processes are managed by their owners
+        self._finalizer = weakref.finalize(self, shutdown, self.processes)
+
+    def close(self) -> None:
+        self._finalizer()
+
+
 class CoreEngineProcManager:
     """
     Utility class to handle creation, readiness, and shutdown
@@ -109,7 +191,7 @@ class CoreEngineProcManager:
         local_start_index: int,
         vllm_config: VllmConfig,
         on_head_node: bool,
-        input_address: str,
+        handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
     ):
@@ -117,12 +199,12 @@ class CoreEngineProcManager:
         common_kwargs = {
             "vllm_config": vllm_config,
             "on_head_node": on_head_node,
-            "input_address": input_address,
+            "handshake_address": handshake_address,
             "executor_class": executor_class,
             "log_stats": log_stats,
         }
 
-        self.processes: list[Process] = []
+        self.processes: list[BaseProcess] = []
         for index in range(local_engine_count):
             local_index = local_start_index + index
             global_index = start_index + index
@@ -135,8 +217,7 @@ class CoreEngineProcManager:
                                     "local_dp_rank": local_index,
                                 }))
 
-        self._finalizer = weakref.finalize(self, shutdown, self.processes,
-                                           input_address)
+        self._finalizer = weakref.finalize(self, shutdown, self.processes)
         try:
             for proc in self.processes:
                 proc.start()
@@ -164,9 +245,199 @@ class CoreEngineProcManager:
         }
 
 
+class CoreEngineState(Enum):
+    NEW = auto()
+    CONNECTED = auto()
+    READY = auto()
+
+
+class CoreEngine:
+    """One per data parallel rank."""
+
+    def __init__(self, index: int = 0, local: bool = True):
+        self.local = local
+        self.index = index
+        self.identity = index.to_bytes(2, "little")
+
+        self.state = CoreEngineState.NEW
+
+
+@dataclass
+class EngineZmqAddresses:
+    # ZMQ input socket addresses for each front-end client (requests)
+    inputs: list[str]
+    # ZMQ output socket addresses for each front-end client (responses)
+    outputs: list[str]
+    # ZMQ input socket address of DP coordinator if applicable
+    coordinator_input: Optional[str] = None
+    # ZMQ output socket address of DP coordinator if applicable
+    coordinator_output: Optional[str] = None
+
+
+@dataclass
+class EngineHandshakeMetadata:
+    """Metadata sent to each engine process during startup handshake,
+    including addresses of the front-end ZMQ queues that they should
+    connect to.
+    """
+    addresses: EngineZmqAddresses
+    parallel_config: dict[str, Union[int, str]]
+
+
+def wait_for_engine_startup(
+    handshake_socket: zmq.Socket,
+    addresses: EngineZmqAddresses,
+    core_engines: list[CoreEngine],
+    parallel_config: ParallelConfig,
+    cache_config: CacheConfig,
+    proc_manager: Optional[CoreEngineProcManager],
+    coord_process: Optional[Process],
+):
+
+    # Wait for engine core process(es) to send ready messages.
+    local_count = parallel_config.data_parallel_size_local
+    remote_count = len(core_engines) - local_count
+    # [local, remote] counts
+    conn_pending, start_pending = [local_count, remote_count], [0, 0]
+    poller = zmq.Poller()
+    poller.register(handshake_socket, zmq.POLLIN)
+
+    if proc_manager is not None:
+        for sentinel in proc_manager.sentinels():
+            poller.register(sentinel, zmq.POLLIN)
+    if coord_process is not None:
+        poller.register(coord_process.sentinel, zmq.POLLIN)
+    while any(conn_pending) or any(start_pending):
+        events = poller.poll(STARTUP_POLL_PERIOD_MS)
+        if not events:
+            if any(conn_pending):
+                logger.debug(
+                    "Waiting for %d local, %d remote core engine proc(s) "
+                    "to connect.", *conn_pending)
+            if any(start_pending):
+                logger.debug(
+                    "Waiting for %d local, %d remote core engine proc(s) "
+                    "to start.", *start_pending)
+            continue
+        if len(events) > 1 or events[0][0] != handshake_socket:
+            # One of the local core processes exited.
+            finished = proc_manager.finished_procs() if proc_manager else {}
+            if coord_process is not None and coord_process.exitcode is not None:
+                finished[coord_process.name] = coord_process.exitcode
+            raise RuntimeError("Engine core initialization failed. "
+                               "See root cause above. "
+                               f"Failed core proc(s): {finished}")
+
+        # Receive HELLO and READY messages from the input socket.
+        eng_identity, ready_msg_bytes = handshake_socket.recv_multipart()
+        eng_index = int.from_bytes(eng_identity, "little")
+        engine = next((e for e in core_engines if e.identity == eng_identity),
+                      None)
+        if engine is None:
+            raise RuntimeError(f"Message from engine with unexpected data "
+                               f"parallel rank: {eng_index}")
+        msg = msgspec.msgpack.decode(ready_msg_bytes)
+        status, local = msg["status"], msg["local"]
+        if local != engine.local:
+            raise RuntimeError(f"{status} message from "
+                               f"{'local' if local else 'remote'} "
+                               f"engine {eng_index}, expected it to be "
+                               f"{'local' if engine.local else 'remote'}")
+
+        if status == "HELLO" and engine.state == CoreEngineState.NEW:
+
+            # Send init message with DP config info.
+            init_message = msgspec.msgpack.encode(
+                EngineHandshakeMetadata(
+                    addresses=addresses,
+                    parallel_config={
+                        "data_parallel_master_ip":
+                        parallel_config.data_parallel_master_ip,
+                        "data_parallel_master_port":
+                        parallel_config.data_parallel_master_port,
+                        "data_parallel_size":
+                        parallel_config.data_parallel_size,
+                    }))
+            handshake_socket.send_multipart((eng_identity, init_message),
+                                            copy=False)
+            conn_pending[0 if local else 1] -= 1
+            start_pending[0 if local else 1] += 1
+            engine.state = CoreEngineState.CONNECTED
+        elif status == "READY" and (engine.state == CoreEngineState.CONNECTED):
+            # Setup KV cache config with initialization state from
+            # engine core process. Sum values from all engines in DP case.
+            num_gpu_blocks = cache_config.num_gpu_blocks or 0
+            num_gpu_blocks += msg["num_gpu_blocks"]
+            cache_config.num_gpu_blocks = num_gpu_blocks
+
+            start_pending[0 if local else 1] -= 1
+            engine.state = CoreEngineState.READY
+        else:
+            raise RuntimeError(f"Unexpected {status} message for "
+                               f"{'local' if local else 'remote'} engine "
+                               f"{eng_index} in {engine.state} state.")
+
+        logger.debug("%s from %s core engine process %s.", status,
+                     "local" if local else "remote", eng_index)
+
+
+def wait_for_completion_or_failure(
+        api_server_manager: APIServerProcessManager,
+        local_engine_manager: Optional[CoreEngineProcManager] = None,
+        coordinator: Optional["DPCoordinator"] = None) -> None:
+    """Wait for all processes to complete or detect if any fail.
+    
+    Raises an exception if any process exits with a non-zero status.
+    """
+
+    try:
+        logger.info("Waiting for API servers to complete ...")
+        # Create a mapping of sentinels to their corresponding processes
+        # for efficient lookup
+        sentinel_to_proc: dict[Any, BaseProcess] = {
+            proc.sentinel: proc
+            for proc in api_server_manager.processes
+        }
+
+        if coordinator:
+            sentinel_to_proc[coordinator.proc.sentinel] = coordinator.proc
+
+        if local_engine_manager:
+            for proc in local_engine_manager.processes:
+                sentinel_to_proc[proc.sentinel] = proc
+
+        # Check if any process terminates
+        while sentinel_to_proc:
+            # Wait for any process to terminate
+            ready_sentinels: list[Any] = connection.wait(sentinel_to_proc)
+
+            # Process any terminated processes
+            for sentinel in ready_sentinels:
+                proc = sentinel_to_proc.pop(sentinel)
+
+                # Check if process exited with error
+                if proc.exitcode != 0:
+                    raise RuntimeError(
+                        f"Process {proc.name} (PID: {proc.pid}) "
+                        f"died with exit code {proc.exitcode}")
+    except KeyboardInterrupt:
+        logger.info("Received KeyboardInterrupt, shutting down API servers...")
+    except Exception as e:
+        logger.exception("Exception occurred while running API servers: %s",
+                         str(e))
+        raise
+    finally:
+        logger.info("Terminating remaining processes ...")
+        api_server_manager.close()
+        if coordinator:
+            coordinator.close()
+        if local_engine_manager:
+            local_engine_manager.close()
+
+
 # Note(rob): shutdown function cannot be a bound method,
-# else the gc cannot collect the objedecoupct.
-def shutdown(procs: list[Process], input_address: str):
+# else the gc cannot collect the object.
+def shutdown(procs: list[BaseProcess]):
     # Shutdown the process.
     for proc in procs:
         if proc.is_alive():
@@ -185,12 +456,6 @@ def shutdown(procs: list[Process], input_address: str):
         if proc.is_alive() and (pid := proc.pid) is not None:
             kill_process_tree(pid)
 
-    # Remove zmq ipc socket files.
-    if input_address.startswith("ipc://"):
-        socket_file = input_address[len("ipc://"):]
-        if os and os.path.exists(socket_file):
-            os.remove(socket_file)
-
 
 def bind_kv_cache(
     kv_caches: dict[str, torch.Tensor],
-- 
GitLab


From f49239cb454b0ab1169fadd6a700812f33a0eafa Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 30 May 2025 12:56:11 -0400
Subject: [PATCH 081/274] Benchmark script for fp8 vs bf16 gemm (#17126)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 benchmarks/kernels/bench_fp8_gemm.py | 222 +++++++++++++++++++++++++++
 benchmarks/kernels/weight_shapes.py  |  46 ++++++
 2 files changed, 268 insertions(+)
 create mode 100644 benchmarks/kernels/bench_fp8_gemm.py

diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py
new file mode 100644
index 000000000..36d03e40e
--- /dev/null
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import copy
+import itertools
+
+import torch
+import triton
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+        x_log=False,
+        line_arg="provider",
+        line_vals=[
+            "torch-bf16",
+            # "fp8-tensor-w-token-a",
+            "fp8-tensor-w-tensor-a",
+            "fp8-channel-w-token-a",
+            # "fp8-channel-w-tensor-a",
+            # "fp8-tensor-w-token-a-noquant",
+            "fp8-tensor-w-tensor-a-noquant",
+            "fp8-channel-w-token-a-noquant",
+            # "fp8-channel-w-tensor-a-noquant",
+        ],
+        line_names=[
+            "torch-bf16",
+            # "fp8-tensor-w-token-a",
+            "fp8-tensor-w-tensor-a",
+            "fp8-channel-w-token-a",
+            # "fp8-channel-w-tensor-a",
+            # "fp8-tensor-w-token-a-noquant",
+            "fp8-tensor-w-tensor-a-noquant",
+            "fp8-channel-w-token-a-noquant",
+            # "fp8-channel-w-tensor-a-noquant",
+        ],
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs FP8 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    # Create input tensors
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if "torch-bf16" in provider:
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+        )
+
+    elif "fp8" in provider:
+        # Weights are always quantized ahead of time
+        if "noquant" in provider:
+            # For no quantization, we just measure the GEMM
+            if "tensor-w-token-a" in provider:
+                # Dynamic per-token quant for A, per-tensor quant for B
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
+                assert scale_b_fp8.numel() == 1
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                    a, use_per_token_if_dynamic=True
+                )
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "tensor-w-tensor-a" in provider:
+                # Static per-tensor quantization with fixed scales
+                # for both A and B
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                assert scale_b_fp8.numel() == 1
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-token-a" in provider:
+                # Static per-channel quantization for weights, per-token
+                # quant for A
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                    a, use_per_token_if_dynamic=True
+                )
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-tensor-a" in provider:
+                # Static per-channel quantization for weights, per-tensor
+                # quant for A
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+        else:
+            # In these cases, we quantize the activations during the GEMM call
+            if "tensor-w-token-a" in provider:
+                # Dynamic per-token quant for A, per-tensor quant for B
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
+                assert scale_b_fp8.numel() == 1
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                        a, use_per_token_if_dynamic=True
+                    )
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "tensor-w-tensor-a" in provider:
+                # Static per-tensor quantization with fixed scales
+                # for both A and B
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                assert scale_b_fp8.numel() == 1
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-token-a" in provider:
+                # Static per-channel quantization for weights, per-token
+                # quant for A
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                        a, use_per_token_if_dynamic=True
+                    )
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-tensor-a" in provider:
+                # Static per-channel quantization for weights, per-tensor
+                # quant for A
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+        b_fp8 = b_fp8.t()
+
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), quantiles=quantiles
+        )
+
+    # Calculate TFLOP/s, two flops per multiply-add
+    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
+    return tflops(ms), tflops(max_ms), tflops(min_ms)
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        assert model in WEIGHT_SHAPES
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        choices=[*WEIGHT_SHAPES.keys()],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    KN_model_names = prepare_shapes(args)
+    for K, N, model_name in KN_model_names:
+        print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path=f"bench_fp8_res_n{N}_k{K}",
+            N=N,
+            K=K,
+        )
+
+    print("Benchmark finished!")
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index 89b05d588..afe159ddd 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -48,4 +48,50 @@ WEIGHT_SHAPES = {
         ([16384, 106496], 1),
         ([53248, 16384], 0),
     ],
+    "meta-llama/Llama-3.1-8B-Instruct": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-3.3-70B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "mistralai/Mistral-Large-Instruct-2407": [
+        ([12288, 14336], 1),
+        ([12288, 12288], 0),
+        ([12288, 57344], 1),
+        ([28672, 12288], 0),
+    ],
+    "Qwen/Qwen2.5-7B-Instruct": [
+        ([3584, 4608], 1),
+        ([3584, 3584], 0),
+        ([3584, 37888], 1),
+        ([18944, 3584], 0),
+    ],
+    "Qwen/Qwen2.5-32B-Instruct": [
+        ([5120, 7168], 1),
+        ([5120, 5120], 0),
+        ([5120, 55296], 1),
+        ([27648, 5120], 0),
+    ],
+    "Qwen/Qwen2.5-72B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 59136], 1),
+        ([29568, 8192], 0),
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        ([2048, 3072], 1),
+        ([2048, 4096], 1),
+        ([2048, 2048], 0),
+        ([2048, 576], 0),
+        ([2048, 21888], 1),
+        ([10944, 2048], 0),
+        ([2048, 2816], 1),
+        ([1408, 2048], 0),
+    ],
 }
-- 
GitLab


From 5a8641638a5ec6cf089490d6b7f474b135ff590e Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 31 May 2025 01:11:44 +0800
Subject: [PATCH 082/274] [VLM] Add PP support and fix GPTQ inference for Ovis
 models (#18958)

Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/models/supported_models.md             |   2 +-
 tests/distributed/test_pipeline_parallel.py |   1 +
 vllm/model_executor/models/aimv2.py         | 194 ++++++++++++--------
 vllm/model_executor/models/clip.py          |   5 -
 vllm/model_executor/models/ovis.py          |  34 ++--
 5 files changed, 145 insertions(+), 91 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 6b0ceaf21..b60fefdda 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -538,7 +538,7 @@ Specified using `--task generate`.
 | `MllamaForConditionalGeneration`             | Llama 3.2                                                                | T + I<sup>+</sup>                                                     | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.                                                                     |                        |                             |                        |
 | `MolmoForCausalLM`                           | Molmo                                                                    | T + I<sup>+</sup>                                                     | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc.                                                                                              | ✅︎                     | ✅︎                          | ✅︎                    |
 | `NVLM_D_Model`                               | NVLM-D 1.0                                                               | T + I<sup>+</sup>                                                     | `nvidia/NVLM-D-72B`, etc.                                                                                                                               |                       | ✅︎                          | ✅︎                      |
-| `Ovis`                                       | Ovis2, Ovis1.6                                                           | T + I<sup>+</sup>                                                     | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.                                                                                                 |                       |                             | ✅︎                      |
+| `Ovis`                                       | Ovis2, Ovis1.6                                                           | T + I<sup>+</sup>                                                     | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.                                                                                                 |                       | ✅︎                          | ✅︎                      |
 | `PaliGemmaForConditionalGeneration`          | PaliGemma, PaliGemma 2                                                   | T + I<sup>E</sup>                                                     | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.                                                  |                       | ✅︎                          | ⚠️                       |
 | `Phi3VForCausalLM`                           | Phi-3-Vision, Phi-3.5-Vision                                             | T + I<sup>E+</sup>                                                    | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.                                                                       |                      | ✅︎                          |  ✅︎                     |
 | `Phi4MMForCausalLM`                          | Phi-4-multimodal                                                         | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc.                                                                                                             | ✅︎                     | ✅︎                          | ✅︎                       |
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 5346d67b1..e6410ab06 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -227,6 +227,7 @@ MULTIMODAL_MODELS = {
     "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
     "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
     "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
+    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
     "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
     "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
     "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index aefd6c973..2e2a18abd 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -2,16 +2,23 @@
 
 # A modified implementation of the AIMv2 Transformer
 # inserted here also the image tokenizer used by Ovis2
+from collections.abc import Iterable
 from typing import Optional
 
 import torch
 import torch.nn as nn
-from torch.nn import functional as F
 
+from vllm.attention.layer import MultiHeadAttention
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed.utils import divide
+from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.transformers_utils.configs.ovis import AIMv2Config
 
 
@@ -24,29 +31,27 @@ class AIMv2SwiGLUFFN(nn.Module):
         in_features = config.hidden_size
         bias = config.use_bias
 
-        # TODO(Isotr0py): investigate if we can add TP to visual tokenizer
-        self.fc1 = ReplicatedLinear(in_features,
-                                    hidden_features,
-                                    bias=bias,
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.fc1")
-        self.fc2 = ReplicatedLinear(hidden_features,
-                                    in_features,
-                                    bias=bias,
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.fc2")
-        self.fc3 = ReplicatedLinear(in_features,
-                                    hidden_features,
-                                    bias=bias,
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.fc3")
+        self.fc13 = MergedColumnParallelLinear(
+            in_features,
+            [hidden_features] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc13",
+        )
+        self.fc2 = RowParallelLinear(
+            input_size=hidden_features,
+            output_size=in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+        self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_parallel, _ = self.fc1(x)
-        gate, _ = self.fc3(x)
-        x_parallel = F.silu(x_parallel) * gate
-        out, _ = self.fc2(x_parallel)
-        return out
+        x, _ = self.fc13(x)
+        x = self.act_fn(x)
+        x, _ = self.fc2(x)
+        return x
 
 
 class AIMv2PatchEmbed(nn.Module):
@@ -90,39 +95,45 @@ class AIMv2Attention(nn.Module):
     def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
                  prefix: str):
         super().__init__()
-        dim = config.hidden_size
-
-        # TODO(Isotr0py): investigate if we can add TP to visual tokenizer
+        self.config = config
+        self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
-        self.qkv = ReplicatedLinear(dim, dim * 3, bias=config.qkv_bias)
-        # self.qkv = QKVParallelLinear(
-        #               hidden_size=dim,
-        #               head_size=dim // config.num_attention_heads,
-        #               total_num_heads=config.num_attention_heads,
-        #               bias=config.qkv_bias,
-        #               quant_config=quant_config,
-        #               prefix=f"{prefix}.qkv")
-        self.proj = ReplicatedLinear(dim, dim, bias=config.use_bias)
-        # self.proj = RowParallelLinear(input_size=dim,
-        #                  output_size=dim,
-        #                  bias = config.use_bias,
-        #                  quant_config=quant_config,
-        #                  prefix=f"{prefix}.proj")
-
-    def forward(  # todo might implement multiple attn implementations
-            self,
-            x: torch.Tensor,
-            mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        B, N, C = x.shape
-        qkv, _ = self.qkv(x)
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                "embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads}).")
+        self.scale = self.head_dim**-0.5
+
+        self.qkv = QKVParallelLinear(
+            hidden_size=self.embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=self.num_heads,
+            bias=config.qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv",
+        )
+
+        self.proj = RowParallelLinear(
+            input_size=self.embed_dim,
+            output_size=self.embed_dim,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+        )
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        qkv = qkv.reshape(B, N, 3, self.num_heads,
-                          C // self.num_heads).permute(2, 0, 3, 1, 4)
+        self.attn = MultiHeadAttention(self.num_heads_per_partition,
+                                       self.head_dim, self.scale)
 
-        q, k, v = qkv.unbind(0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        qkv, _ = self.qkv(x)
+        q, k, v = qkv.chunk(3, dim=-1)
 
-        x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
-        x = x.transpose(1, 2).contiguous().reshape(B, N, C)
+        x = self.attn(q, k, v)
         x, _ = self.proj(x)
         return x
 
@@ -141,37 +152,40 @@ class AIMv2Block(nn.Module):
                                   prefix=f"{prefix}.mlp")
         self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def forward(self,
-                x: torch.Tensor,
-                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        x = x + self.attn(self.norm_1.forward_native(x), mask)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.norm_1.forward_native(x))
         x = x + self.mlp(self.norm_2.forward_native(x))
         return x
 
 
 class AIMv2Transformer(nn.Module):
 
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(
+        self,
+        config: AIMv2Config,
+        quant_config: QuantizationConfig,
+        *,
+        require_post_norm: Optional[bool] = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
         self.blocks = nn.ModuleList([
             AIMv2Block(config, quant_config, prefix=f"{prefix}.blocks.{i}")
             for i in range(config.num_hidden_layers)
         ])
-        self.post_trunk_norm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        if require_post_norm:
+            self.post_trunk_norm = RMSNorm(config.hidden_size,
+                                           eps=config.rms_norm_eps)
+        else:
+            self.post_trunk_norm = None
 
-    def forward(
-        self,
-        tokens: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
         # they take the -1 as the ref embeddings, like a clip skip
         for block in self.blocks:
-            tokens = block(tokens, mask)
-        # NO NORM IN THE OG IMPLEMENTATION
-        # tokens = self.post_trunk_norm(tokens)
+            tokens = block(tokens)
+        if self.post_trunk_norm is not None:
+            tokens = self.post_trunk_norm(tokens)
         return tokens
 
 
@@ -180,20 +194,52 @@ class AIMv2Model(torch.nn.Module):
     def __init__(self,
                  config: AIMv2Config,
                  quant_config: QuantizationConfig,
+                 *,
+                 require_post_norm: Optional[bool] = None,
                  prefix: str = ""):
         super().__init__()
         self.preprocessor = AIMv2ViTPreprocessor(config)
         self.trunk = AIMv2Transformer(config,
                                       quant_config=quant_config,
+                                      require_post_norm=require_post_norm,
                                       prefix=f"{prefix}.trunk")
 
-    def forward(
-        self,
-        pixel_values: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
 
         x = self.preprocessor(pixel_values)
-        x = self.trunk(x, mask)
+        x = self.trunk(x)
 
         return x
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".fc13", ".fc1", 0),
+            (".fc13", ".fc3", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            # post_layernorm is optional in SiglipVisionModel
+            if (name.startswith("trunk.post_trunk_norm")
+                    and self.trunk.post_trunk_norm is None):
+                continue
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index e8f3ae215..9fd528fd7 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -106,7 +106,6 @@ class CLIPAttention(nn.Module):
                 f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
                 f" {self.num_heads}).")
         self.scale = self.head_dim**-0.5
-        self.dropout = config.attention_dropout
 
         self.qkv_proj = QKVParallelLinear(
             hidden_size=self.embed_dim,
@@ -129,10 +128,6 @@ class CLIPAttention(nn.Module):
         self.attn = MultiHeadAttention(self.num_heads_per_partition,
                                        self.head_dim, self.scale)
 
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads,
-                           self.head_dim).transpose(1, 2).contiguous()
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index e03705d48..232a63c50 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -30,6 +30,9 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig)
 from vllm.model_executor.models.aimv2 import AIMv2Model
 from vllm.model_executor.models.siglip import SiglipVisionModel
 from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
@@ -48,7 +51,7 @@ from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig,
                                                   OvisConfig)
 from vllm.transformers_utils.processors.ovis import OvisProcessor
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import merge_multimodal_embeddings
 
 # Cannot find the following number from hf config.
@@ -106,12 +109,14 @@ class VisualTokenizer(torch.nn.Module):
         config: BaseVisualTokenizerConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-    ):
+    ) -> nn.Module:
         model_type = config.backbone_config.model_type
         if model_type == "aimv2":
+            # No post rms_norm in Ovis2's AIMv2 ViT.
             return AIMv2Model(
                 config=config.backbone_config,
                 quant_config=quant_config,
+                require_post_norm=False,
                 prefix=prefix,
             )
         elif model_type == "siglip_vision_model":
@@ -124,14 +129,14 @@ class VisualTokenizer(torch.nn.Module):
             f"Unsupported visual tokenizer model_type: {model_type}")
 
     @property
-    def dtype(self):
+    def dtype(self) -> torch.dtype:
         return next(self.head.parameters()).dtype
 
     @property
-    def device(self):
+    def device(self) -> torch.device:
         return next(self.head.parameters()).device
 
-    def tokenize(self, logits):
+    def tokenize(self, logits: torch.Tensor) -> torch.Tensor:
         if self.config.tokenize_function == 'softmax':
             tokens = softmax(logits, dim=-1)
         elif self.config.tokenize_function == 'gumbel_argmax':
@@ -144,7 +149,7 @@ class VisualTokenizer(torch.nn.Module):
                 f'or st_argmax, but got {self.config.tokenize_function}')
         return tokens
 
-    def encode(self, pixel_values):
+    def encode(self, pixel_values: torch.Tensor) -> torch.Tensor:
         features = self.backbone(pixel_values)
         if self.config.drop_cls_token:
             features = features[:, 1:, :]
@@ -395,7 +400,7 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
 @MULTIMODAL_REGISTRY.register_processor(OvisMultiModalProcessor,
                                         info=OvisProcessingInfo,
                                         dummy_inputs=OvisDummyInputsBuilder)
-class Ovis(nn.Module, SupportsMultiModal):
+class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -410,7 +415,7 @@ class Ovis(nn.Module, SupportsMultiModal):
 
         self.visual_tokenizer = VisualTokenizer(
             config=config.visual_tokenizer_config,
-            quant_config=quant_config,
+            quant_config=self._maybe_ignore_quant_config(quant_config),
             prefix=f"{prefix}.visual_tokenizer",
         )
 
@@ -421,9 +426,16 @@ class Ovis(nn.Module, SupportsMultiModal):
         text_model_type = self.config.get_text_config().model_type
         self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type]
 
-        # TODO(Isotr0py): PP support
-        # self.make_empty_intermediate_tensors = (
-        #    self.language_model.make_empty_intermediate_tensors)
+        self.make_empty_intermediate_tensors = (
+            self.get_language_model().make_empty_intermediate_tensors)
+
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
+        # seems to avoid vision encoder sections for some models.
+        # See: https://huggingface.co/AIDC-AI/Ovis2-2B-GPTQ-Int4
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
 
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[OvisImagePatchInputs]:
-- 
GitLab


From 7f21e8052b5f3948c8a59514a8dc1e9c5eef70d6 Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Sat, 31 May 2025 01:34:22 +0800
Subject: [PATCH 083/274] [Misc] add group_size is -1 in awq quantization
 (#18910)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
---
 vllm/model_executor/layers/quantization/awq.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 4660c28c8..87afdb623 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -101,7 +101,13 @@ class AWQLinearMethod(LinearMethodBase):
                        output_partition_sizes: list[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        if input_size_per_partition % self.quant_config.group_size != 0:
+        # Normalize group_size
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        if input_size_per_partition % group_size != 0:
             raise ValueError(
                 "The input size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
@@ -127,9 +133,11 @@ class AWQLinearMethod(LinearMethodBase):
             packed_factor=self.quant_config.pack_factor,
             weight_loader=weight_loader)
 
+        num_groups = input_size_per_partition // group_size
+
         qzeros = PackedvLLMParameter(
             data=torch.empty(
-                input_size_per_partition // self.quant_config.group_size,
+                num_groups,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
             ),
@@ -140,7 +148,7 @@ class AWQLinearMethod(LinearMethodBase):
             weight_loader=weight_loader)
 
         scales = GroupQuantScaleParameter(data=torch.empty(
-            input_size_per_partition // self.quant_config.group_size,
+            num_groups,
             output_size_per_partition,
             dtype=params_dtype,
         ),
-- 
GitLab


From 1dab4d5718243687acd9edc0ccaa0c94ddc43934 Mon Sep 17 00:00:00 2001
From: Will Eaton <wseaton@users.noreply.github.com>
Date: Fri, 30 May 2025 17:02:54 -0400
Subject: [PATCH 084/274] Tool parser regex timeout handling (#18960)

Signed-off-by: Will Eaton <weaton@redhat.com>
---
 .../test_llama4_pythonic_tool_parser.py       | 26 ++++++++++++++++++-
 .../tool_parsers/test_pythonic_tool_parser.py | 26 ++++++++++++++++++-
 .../llama4_pythonic_tool_parser.py            | 15 ++++++++++-
 .../tool_parsers/pythonic_tool_parser.py      | 15 +++++++++--
 vllm/envs.py                                  |  5 ++++
 5 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
index 92ba1376e..f5f327ea0 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -191,3 +191,27 @@ def test_streaming_tool_call_with_large_steps():
     assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
     assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
     assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+
+
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool):
+    """test regex timeout is handled gracefully"""
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+
+    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
+        content, tool_calls = run_tool_extraction(tool_parser,
+                                                  fake_problematic_input,
+                                                  streaming=streaming)
+
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index fbbbc1fb2..71f41ea7d 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -159,3 +159,27 @@ def test_streaming_tool_call_with_large_steps():
     assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
     assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
     assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+
+
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool):
+    """test regex timeout is handled gracefully"""
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+
+    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
+        content, tool_calls = run_tool_extraction(tool_parser,
+                                                  fake_problematic_input,
+                                                  streaming=streaming)
+
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index 858c8db99..323fb1441 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -7,6 +7,7 @@ from typing import Any, Union
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
+import vllm.envs as envs
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaFunctionCall, DeltaMessage,
                                               DeltaToolCall,
@@ -64,7 +65,19 @@ class Llama4PythonicToolParser(ToolParser):
         if model_output.startswith("<|python_start|>"):
             model_output = model_output[len("<|python_start|>"):]
             model_output = model_output.replace("<|python_end|>", "")
-        if not (self.TOOL_CALL_REGEX.match(model_output)):
+
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
+                model_output,
+                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+        except TimeoutError:
+            logger.warning(
+                "Regex timeout occurred when matching tool call pattern.")
+            logger.debug("Regex timeout occurred when matching user input: %s",
+                         model_output)
+
+        if not is_tool_call_pattern:
             return ExtractedToolCallInformation(tools_called=False,
                                                 tool_calls=[],
                                                 content=model_output)
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 548ff39d1..bc5d15dcb 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -8,6 +8,7 @@ from typing import Any, Union
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
+import vllm.envs as envs
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaFunctionCall, DeltaMessage,
                                               DeltaToolCall,
@@ -61,8 +62,18 @@ class PythonicToolParser(ToolParser):
         """
         Extract the tool calls from a complete model response.
         """
-
-        if not (self.TOOL_CALL_REGEX.match(model_output)):
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
+                model_output,
+                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+        except TimeoutError:
+            logger.warning(
+                "Regex timeout occurred when matching tool call pattern.")
+            logger.debug("Regex timeout occurred when matching user input: %s",
+                         model_output)
+
+        if not is_tool_call_pattern:
             return ExtractedToolCallInformation(tools_called=False,
                                                 tool_calls=[],
                                                 content=model_output)
diff --git a/vllm/envs.py b/vllm/envs.py
index dc52bbd8e..44baf5a18 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -119,6 +119,7 @@ if TYPE_CHECKING:
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
     VLLM_ALL2ALL_BACKEND: str = "naive"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
+    VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
 
 
 def get_default_cache_root():
@@ -828,6 +829,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # This is used to prevent the kernel from running out of memory.
     "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
     lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
+
+    # Regex timeout for use by the vLLM tool parsing plugins.
+    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
+    lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),
 }
 
 # --8<-- [end:env-vars-definition]
-- 
GitLab


From 0f71e24034263363e48cdc6ae036e7ca057a4f44 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Sat, 31 May 2025 02:30:15 +0100
Subject: [PATCH 085/274] [Docs] Correct multiprocessing design doc (#18964)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 docs/design/multiprocessing.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md
index 412c42fd5..4d58fae20 100644
--- a/docs/design/multiprocessing.md
+++ b/docs/design/multiprocessing.md
@@ -22,13 +22,13 @@ This document describes how vLLM deals with these challenges.
 
 [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
 
-- `spawn` - spawn a new Python process. This will be the default as of Python
-  3.14. In macOS, this is already the default.
+- `spawn` - spawn a new Python process. The default on Windows and macOS.
 
-- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
-  in Python versions prior to 3.14.
+- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
+  Linux for Python versions prior to 3.14.
 
 - `forkserver` - Spawn a server process that will fork a new process on request.
+  The default on Linux for Python version 3.14 and newer.
 
 ### Tradeoffs
 
-- 
GitLab


From 7782464a1714f6081ca06f47b75e824b14316c72 Mon Sep 17 00:00:00 2001
From: Yu Guo <82124926+yuguo68@users.noreply.github.com>
Date: Fri, 30 May 2025 22:50:38 -0700
Subject: [PATCH 086/274] create util function for batched arange (#18937)

---
 vllm/v1/worker/gpu_model_runner.py | 64 ++++++++++++++++--------------
 1 file changed, 35 insertions(+), 29 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 60425a4e1..b6fa68ab0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -500,6 +500,26 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if batch_changed or batch_reordered:
             self.input_batch.refresh_sampling_metadata()
 
+    def _get_cumsum_and_arange(
+        self,
+        num_tokens: np.ndarray,
+        cumsum_dtype: Optional[np.dtype] = None,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Get the cumulative sum and batched arange of the given array.
+        # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
+        # Equivalent to but faster than:
+        # np.concatenate([np.arange(n) for n in num_tokens])
+        """
+        # Step 1. [2, 5, 3] -> [2, 7, 10]
+        cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
+        total_num_tokens = cu_num_tokens[-1]
+        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
+        cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
+        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        arange = self.arange_np[:total_num_tokens] - cumsums_offsets
+
+        return cu_num_tokens, arange
+
     def _prepare_inputs(
         self,
         scheduler_output: "SchedulerOutput",
@@ -525,17 +545,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         req_indices = np.repeat(self.arange_np[:num_reqs],
                                 num_scheduled_tokens)
 
-        # Get batched arange.
-        # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        # Equivalent to but faster than:
-        # np.concatenate([np.arange(n) for n in num_scheduled_tokens])
-        # Step 1. [2, 5, 3] -> [2, 7, 10]
-        cu_num_tokens = np.cumsum(num_scheduled_tokens)
-        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
-        cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,
-                                    num_scheduled_tokens)
-        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets
+        # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
+        # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        cu_num_tokens, arange = self._get_cumsum_and_arange(
+            num_scheduled_tokens)
 
         # Get positions.
         positions_np = self.positions_np[:total_num_scheduled_tokens]
@@ -841,32 +854,25 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # Compute the logits indices.
         # [4, 1, 3, 1, 2]
         num_sampled_tokens = num_draft_tokens + 1
-        # Step 1. [4, 5, 8, 9, 11]
-        cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32)
-        total_num_sampled_tokens = cu_num_sampled_tokens[-1]
-        # Step 2. [0, 0, 0, 0, 4, 5, 5, 5, 8, 9, 9]
-        cumsums_offsets = np.repeat(cu_num_sampled_tokens - num_sampled_tokens,
-                                    num_sampled_tokens)
-        # Step 3. [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
-        arange = self.arange_np[:total_num_sampled_tokens] - cumsums_offsets
-        # Step 4. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
+
+        # Step 1. cu_num_sampled_tokens: [4, 5, 8, 9, 11]
+        # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
+        cu_num_sampled_tokens, arange = self._get_cumsum_and_arange(
+            num_sampled_tokens, cumsum_dtype=np.int32)
+        # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
         logits_indices = np.repeat(
             cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
-        # Step 5. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
+        # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
         logits_indices += arange
 
         # Compute the bonus logits indices.
         bonus_logits_indices = cu_num_sampled_tokens - 1
 
         # Compute the draft logits indices.
-        # [3, 3, 5, 5, 6]
-        cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32)
-        total_num_draft_tokens = cu_num_draft_tokens[-1]
-        # [0, 0, 0, 3, 3, 5]
-        cumsums_offsets = np.repeat(cu_num_draft_tokens - num_draft_tokens,
-                                    num_draft_tokens)
-        # [0, 1, 2, 0, 1, 0]
-        arange = self.arange_np[:total_num_draft_tokens] - cumsums_offsets
+        # cu_num_draft_tokens: [3, 3, 5, 5, 6]
+        # arange: [0, 1, 2, 0, 1, 0]
+        cu_num_draft_tokens, arange = self._get_cumsum_and_arange(
+            num_draft_tokens, cumsum_dtype=np.int32)
         # [0, 0, 0, 5, 5, 9]
         target_logits_indices = np.repeat(
             cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
-- 
GitLab


From dff80b0e42d5d2deebc0a6b0e3d2f6f06bd01c78 Mon Sep 17 00:00:00 2001
From: Pooya Davoodi <pooya.davoodi@parasail.io>
Date: Sat, 31 May 2025 00:40:01 -0700
Subject: [PATCH 087/274] [Frontend] Add rerank support to run_batch endpoint
 (#16278)

Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
---
 tests/entrypoints/openai/test_run_batch.py | 14 +++++++--
 vllm/entrypoints/openai/protocol.py        | 17 +++++++----
 vllm/entrypoints/openai/run_batch.py       | 33 +++++++++++++++++-----
 3 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 27802945a..99639ce51 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -4,6 +4,8 @@ import json
 import subprocess
 import tempfile
 
+import pytest
+
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
 
 # ruff: noqa: E501
@@ -23,9 +25,13 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
 
-INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
 
+INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
+
 
 def test_empty_file():
     with tempfile.NamedTemporaryFile(
@@ -105,11 +111,13 @@ def test_embeddings():
             BatchRequestOutput.model_validate_json(line)
 
 
-def test_score():
+@pytest.mark.parametrize("input_batch",
+                         [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
+def test_score(input_batch):
     with tempfile.NamedTemporaryFile(
             "w") as input_file, tempfile.NamedTemporaryFile(
                 "r") as output_file:
-        input_file.write(INPUT_SCORE_BATCH)
+        input_file.write(input_batch)
         input_file.flush()
         proc = subprocess.Popen([
             "vllm",
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index a7f85e9ee..2f641079e 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1481,6 +1481,10 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
     usage: Optional[UsageInfo] = Field(default=None)
 
 
+BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
+                              ScoreRequest, RerankRequest]
+
+
 class BatchRequestInput(OpenAIBaseModel):
     """
     The per-line object of the batch input file.
@@ -1501,21 +1505,22 @@ class BatchRequestInput(OpenAIBaseModel):
     url: str
 
     # The parameters of the request.
-    body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
+    body: BatchRequestInputBody
 
     @field_validator('body', mode='plain')
     @classmethod
     def check_type_for_url(cls, value: Any, info: ValidationInfo):
         # Use url to disambiguate models
-        url = info.data['url']
+        url: str = info.data["url"]
         if url == "/v1/chat/completions":
             return ChatCompletionRequest.model_validate(value)
         if url == "/v1/embeddings":
             return TypeAdapter(EmbeddingRequest).validate_python(value)
-        if url == "/v1/score":
+        if url.endswith("/score"):
             return ScoreRequest.model_validate(value)
-        return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest,
-                                 ScoreRequest]).validate_python(value)
+        if url.endswith("/rerank"):
+            return RerankRequest.model_validate(value)
+        return TypeAdapter(BatchRequestInputBody).validate_python(value)
 
 
 class BatchResponseData(OpenAIBaseModel):
@@ -1527,7 +1532,7 @@ class BatchResponseData(OpenAIBaseModel):
 
     # The body of the response.
     body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
-                         ScoreResponse]] = None
+                         ScoreResponse, RerankResponse]] = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index f38465b22..ac250b3cb 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                               BatchResponseData,
                                               ChatCompletionResponse,
                                               EmbeddingResponse, ErrorResponse,
-                                              ScoreResponse)
+                                              RerankResponse, ScoreResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
@@ -274,8 +274,11 @@ async def run_request(serving_engine_func: Callable,
                       tracker: BatchProgressTracker) -> BatchRequestOutput:
     response = await serving_engine_func(request.body)
 
-    if isinstance(response,
-                  (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)):
+    if isinstance(
+            response,
+        (ChatCompletionResponse, EmbeddingResponse, ScoreResponse,
+         RerankResponse),
+    ):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
@@ -397,7 +400,7 @@ async def main(args):
             response_futures.append(
                 run_request(embed_handler_fn, request, tracker))
             tracker.submitted()
-        elif request.url == "/v1/score":
+        elif request.url.endswith("/score"):
             score_handler_fn = openai_serving_scores.create_score if \
                 openai_serving_scores is not None else None
             if score_handler_fn is None:
@@ -411,13 +414,29 @@ async def main(args):
             response_futures.append(
                 run_request(score_handler_fn, request, tracker))
             tracker.submitted()
+        elif request.url.endswith("/rerank"):
+            rerank_handler_fn = openai_serving_scores.do_rerank if \
+                openai_serving_scores is not None else None
+            if rerank_handler_fn is None:
+                response_futures.append(
+                    make_async_error_request_output(
+                        request,
+                        error_msg="The model does not support Rerank API",
+                    ))
+                continue
+
+            response_futures.append(
+                run_request(rerank_handler_fn, request, tracker))
+            tracker.submitted()
         else:
             response_futures.append(
                 make_async_error_request_output(
                     request,
-                    error_msg=
-                    "Only /v1/chat/completions, /v1/embeddings, and /v1/score "
-                    "are supported in the batch endpoint.",
+                    error_msg=f"URL {request.url} was used. "
+                    "Supported endpoints: /v1/chat/completions, /v1/embeddings,"
+                    " /score, /rerank ."
+                    "See vllm/entrypoints/openai/api_server.py for supported "
+                    "score/rerank versions.",
                 ))
 
     with tracker.pbar():
-- 
GitLab


From 1e123529d7df1ff8f868b19aeced6a64e67bd618 Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Date: Sat, 31 May 2025 01:43:44 -0700
Subject: [PATCH 088/274] [Misc] Fix estimated max model len msg (#18966)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
---
 vllm/v1/core/kv_cache_utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 403b5401b..a41fe4881 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -544,16 +544,17 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
                                                    available_memory)
         estimated_msg = ""
         if estimated_max_len > 0:
-            estimated_msg = " Based on the available memory,"
-            f" the estimated maximum model length is {estimated_max_len}."
+            estimated_msg = (
+                "Based on the available memory, "
+                f"the estimated maximum model length is {estimated_max_len}.")
 
         raise ValueError(
             f"To serve at least one request with the models's max seq len "
             f"({max_model_len}), ({needed_memory/GiB_bytes:.2f} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory/GiB_bytes:.2f} GiB)."
+            f"memory ({available_memory/GiB_bytes:.2f} GiB). "
             f"{estimated_msg} "
-            f" Try increasing `gpu_memory_utilization` or decreasing "
+            f"Try increasing `gpu_memory_utilization` or decreasing "
             f"`max_model_len` when initializing the engine.")
 
 
-- 
GitLab


From ba5111f2372678fb03e96ee69decd0febd03f13e Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sat, 31 May 2025 17:20:54 +0800
Subject: [PATCH 089/274] [Bugfix]: Fix the incompatibility issue with
 Structured Outputs when Thinking is disabled (#18879)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/v1/structured_output/__init__.py | 30 ++++++++++++++++-----------
 vllm/v1/structured_output/request.py  |  2 +-
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index c701ab1d3..07b422814 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -149,31 +149,37 @@ class StructuredOutputManager:
         # NOTE: This outer loop can likely be parallelized to improve
         # performance of bitmask generation for large batches.
         for req_id, _ in ordered_seq:
-            request = requests[req_id].structured_output_request
-            if TYPE_CHECKING:
-                assert request is not None
-                assert request.grammar is not None
+            request = requests[req_id]
+            structured_output_request = request.structured_output_request
 
-            apply_bitmask = (
-                request.reasoning_ended if self.reasoner is not None else True
-            )  # noqa: E501
+            if TYPE_CHECKING:
+                assert structured_output_request is not None
+                assert structured_output_request.grammar is not None
+            apply_bitmask: bool = True
+            if self.reasoner is not None:
+                if structured_output_request.reasoning_ended is None:
+                    structured_output_request.reasoning_ended = \
+                        self.reasoner.is_reasoning_end(request.prompt_token_ids)
+                apply_bitmask = structured_output_request.reasoning_ended
 
             state_advancements = 0
             req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None]
             for i, token in enumerate(req_tokens):
-                if apply_bitmask and not request.grammar.is_terminated():
-                    request.grammar.fill_bitmask(bitmask_tensor,
-                                                 cumulative_index)
+                if apply_bitmask and not \
+                    structured_output_request.grammar.is_terminated():
+                    structured_output_request.grammar.fill_bitmask(
+                        bitmask_tensor, cumulative_index)
                     if token is not None:
                         # In order to generate the correct bitmask for each
                         # position in the speculative sequence, we advance
                         # the FSM state for each speculative token and rollback
                         # to restore the previous state when we are finished.
-                        assert request.grammar.accept_tokens(req_id, [token])
+                        assert structured_output_request.grammar.accept_tokens(
+                            req_id, [token])
                         state_advancements += 1
                 cumulative_index += 1
             if state_advancements > 0:
-                request.grammar.rollback(state_advancements)
+                structured_output_request.grammar.rollback(state_advancements)
 
         if cumulative_index < bitmask_tensor.shape[0]:
             bitmask_tensor = bitmask_tensor[:cumulative_index]
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index c16320b9e..9a7e30d41 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -20,7 +20,7 @@ class StructuredOutputRequest:
     sampling_params: SamplingParams
     _grammar: Optional[Union[Future[StructuredOutputGrammar],
                              StructuredOutputGrammar]] = None
-    reasoning_ended: bool = False
+    reasoning_ended: Optional[bool] = None
 
     def _check_grammar_completion(self) -> bool:
         # NOTE: We have to lazy import to gate circular imports
-- 
GitLab


From b8b904795d3033c21bcb8f5c36e135e75dc1baf2 Mon Sep 17 00:00:00 2001
From: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Date: Sat, 31 May 2025 03:38:56 -0700
Subject: [PATCH 090/274] fix security issue of logging llm output (#18980)

Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
---
 vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
index b403a1467..00690ad79 100644
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -68,8 +68,8 @@ class Phi4MiniJsonToolParser(ToolParser):
                              len(function_call_arr))
             except json.JSONDecodeError as e:
                 logger.error(
-                    "Failed to parse function calls from model output: %s. "
-                    "Error: %s", model_output, str(e))
+                    "Failed to parse function calls from model output. "
+                    "Error: %s", str(e))
 
             tool_calls: list[ToolCall] = [
                 ToolCall(
-- 
GitLab


From 2a50ef57605ffe332a73e50597276b71e9d52676 Mon Sep 17 00:00:00 2001
From: Satyajith Chilappagari <satchill@amazon.com>
Date: Sat, 31 May 2025 03:39:11 -0700
Subject: [PATCH 091/274] [Neuron] Add Multi-Modal model support for Neuron
 (#18921)

Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
Co-authored-by: Ashraf Mahgoub <ashymahg@amazon.com>
Co-authored-by: Rohith Nallamaddi <nalrohit@amazon.com>
Co-authored-by: FeliciaLuo <luof@amazon.com>
Co-authored-by: Elaine Zhao <elaineyz@amazon.com>
---
 .../offline_inference/neuron_multimodal.py    | 105 ++++++++++++++++++
 vllm/config.py                                |  10 ++
 .../model_loader/neuronx_distributed.py       |  65 ++++++++++-
 vllm/worker/neuron_model_runner.py            |  13 +++
 .../neuronx_distributed_model_runner.py       |  88 ++++++++-------
 5 files changed, 235 insertions(+), 46 deletions(-)
 create mode 100644 examples/offline_inference/neuron_multimodal.py

diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py
new file mode 100644
index 000000000..a9478650b
--- /dev/null
+++ b/examples/offline_inference/neuron_multimodal.py
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: Apache-2.0
+import requests
+import torch
+from neuronx_distributed_inference.models.mllama.utils import add_instruct
+from PIL import Image
+
+from vllm import LLM, SamplingParams, TextPrompt
+
+
+def get_image(image_url):
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    return image
+
+
+# Model Inputs
+PROMPTS = [
+    "What is in this image? Tell me a story",
+    "What is the recipe of mayonnaise in two sentences?",
+    "Describe this image",
+    "What is the capital of Italy famous for?",
+]
+IMAGES = [
+    get_image(
+        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
+    ),
+    None,
+    get_image(
+        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
+    ),
+    None,
+]
+SAMPLING_PARAMS = [
+    dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
+    for _ in range(len(PROMPTS))
+]
+
+
+def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
+    # Prepare all inputs for mllama generation, including:
+    # 1. put text prompt into instruct chat template
+    # 2. compose single text and single image prompt into Vllm's prompt class
+    # 3. prepare sampling parameters
+    input_image = single_image
+    has_image = torch.tensor([1])
+    if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
+        has_image = torch.tensor([0])
+
+    instruct_prompt = add_instruct(prompt, has_image)
+    inputs = TextPrompt(prompt=instruct_prompt)
+
+    if input_image is not None:
+        inputs["multi_modal_data"] = {"image": input_image}
+
+    sampling_params = SamplingParams(**sampling_params)
+    return inputs, sampling_params
+
+
+def print_outputs(outputs):
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    assert (
+        len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
+    ), f"""Text, image prompts and sampling parameters should have the 
+            same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, 
+            and {len(SAMPLING_PARAMS)}"""
+
+    # Create an LLM.
+    llm = LLM(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        max_num_seqs=1,
+        max_model_len=4096,
+        block_size=4096,
+        device="neuron",
+        tensor_parallel_size=32,
+        override_neuron_config={
+            "sequence_parallel_enabled": False,
+            "skip_warmup": True,
+            "save_sharded_checkpoint": True,
+            "on_device_sampling_config": {
+                "global_topk": 1,
+                "dynamic": False,
+                "deterministic": False,
+            },
+        },
+    )
+
+    batched_inputs = []
+    batched_sample_params = []
+    for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
+        inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
+        # test batch-size = 1
+        outputs = llm.generate(inputs, sampling_params)
+        print_outputs(outputs)
+        batched_inputs.append(inputs)
+        batched_sample_params.append(sampling_params)
+
+    # test batch-size = 4
+    outputs = llm.generate(batched_inputs, batched_sample_params)
+    print_outputs(outputs)
diff --git a/vllm/config.py b/vllm/config.py
index 6cec97a5f..dfa44b044 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1360,6 +1360,16 @@ class ModelConfig:
     @property
     def is_encoder_decoder(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
+        """
+        For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to 
+        True to enable cross-attention
+        Neuron needs all multimodal data to be in the decoder and does not 
+        need to explicitly enable cross-attention
+        """
+        if (current_platform.is_neuron()
+                and self.hf_config.model_type == "mllama"):
+            return False
+
         return is_encoder_decoder(self.hf_config)
 
     @property
diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
index 624bd476c..72ad4da29 100644
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -204,6 +204,11 @@ class NeuronMllamaForCausalLM(nn.Module):
                  config: PretrainedConfig,
                  on_device_sampling_disabled: bool = False) -> None:
         super().__init__()
+        # has_image is the only multimodal input that is used in
+        # token-generation
+        # This is a cache (on CPU) that saves has_image data per sequence id
+        # The number of entries in this cache is <= Batch-Size
+        self.has_image_cache: dict[int, torch.Tensor] = {}
         self.config = config
         self.logits_processor = LogitsProcessor(
             config.get_text_config().vocab_size, logits_as_input=True)
@@ -215,11 +220,57 @@ class NeuronMllamaForCausalLM(nn.Module):
 
         # Lazy initialized
         self.model: nn.Module
+        self.is_reorder_needed: bool = True
+
+    def read_from_has_image_cache(self, seq_ids: torch.Tensor):
+        has_image_list = []
+        for index in range(len(seq_ids)):
+            seq_id = seq_ids[index].item()
+            if seq_id in self.has_image_cache:
+                has_image_list.append(self.has_image_cache[seq_id])
+            else:
+                has_image_list.append(torch.tensor([0]))
+        return torch.tensor(has_image_list)
+
+    def write_to_has_image_cache(self, seq_ids: torch.Tensor,
+                                 has_image: torch.Tensor):
+        for index in range(len(seq_ids)):
+            seq_id = seq_ids[index].item()
+            if index < len(has_image):
+                self.has_image_cache[seq_id] = has_image[index]
+            else:
+                self.has_image_cache[seq_id] = torch.zeros(1)
 
     def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 seq_ids: torch.Tensor, pixel_values: torch.Tensor,
                 aspect_ratios: torch.Tensor, num_chunks: torch.Tensor,
                 has_image: torch.Tensor, sampling_params) -> torch.Tensor:
+
+        # We update the has_image cache during prefill
+        # and read the has_image cache during decode
+        if input_ids.shape[-1] > 1:  # prefill
+            self.write_to_has_image_cache(seq_ids, has_image)
+        else:
+            has_image = self.read_from_has_image_cache(seq_ids)
+            bs = input_ids.shape[0]
+            num_chunks = torch.zeros((bs, 1))
+            aspect_ratios = torch.zeros((bs, 1, 2))
+
+        input_block_ids = seq_ids
+        origin_input_block_ids = seq_ids
+        if self.is_reorder_needed:
+            # sort block ids sequentially for perf/neuron support reasons
+            input_block_ids, sorted_indices = torch.sort(input_block_ids)
+            input_ids = torch.index_select(input_ids, 0, sorted_indices)
+            positions = torch.index_select(positions, 0, sorted_indices)
+            sampling_params = torch.index_select(sampling_params, 0,
+                                                 sorted_indices)
+            pixel_values = torch.index_select(pixel_values, 0, sorted_indices)
+            aspect_ratios = torch.index_select(aspect_ratios, 0,
+                                               sorted_indices)
+            num_chunks = torch.index_select(num_chunks, 0, sorted_indices)
+            has_image = torch.index_select(has_image, 0, sorted_indices)
+
         self.vision_mask = create_vision_mask(input_ids, self.vision_token_id)
         output = self.model(
             input_ids.to(torch.int32),
@@ -235,8 +286,14 @@ class NeuronMllamaForCausalLM(nn.Module):
             has_image=has_image.to(torch.int32),
         )
         if self.config.neuron_config.on_device_sampling_config:
-            return output.hidden_states
-        return output.logits[:, -1, :]
+            output = output.hidden_states
+        else:
+            output = output.logits[:, -1, :]
+
+        if self.is_reorder_needed and origin_input_block_ids.shape[0] != 1:
+            restored_indices = torch.argsort(sorted_indices)
+            output = torch.index_select(output, 0, restored_indices)
+        return output
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
@@ -299,7 +356,7 @@ class NeuronMllamaForCausalLM(nn.Module):
             self.model = neuronx_model_cls(compiled_model_path)
             tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
             self.vision_token_id = tokenizer(
-                "<|image|>", add_special_tokens=False).input_ids
+                "<|image|>", add_special_tokens=False).input_ids[0]
             self.model.load(compiled_model_path)
             return
         except (FileNotFoundError, ValueError):
@@ -326,7 +383,7 @@ class NeuronMllamaForCausalLM(nn.Module):
 
         # Read "<|image|>" token_id from the tokenizer
         self.vision_token_id = tokenizer("<|image|>",
-                                         add_special_tokens=False).input_ids
+                                         add_special_tokens=False).input_ids[0]
         logger.info("\nLoading model from compiled checkpoint...")
         self.model.load(compiled_model_path)
 
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 292fe57f3..3aff3e01a 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -169,6 +169,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
 
             mm_kwargs = seq_group_metadata.multi_modal_data
             if mm_kwargs:
+                mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs)
                 multi_modal_kwargs_list.append(mm_kwargs)
 
         max_seq_len = max(seq_lens)
@@ -274,6 +275,14 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                 sampling_params.top_p = top_p
                 sampling_params.temperature = temperature
 
+        # we need multi_modal_data for later tokens as well
+        multi_modal_kwargs_list: List[MultiModalKwargs] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                multi_modal_kwargs_list.append(mm_data)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
+
         sampling_metadata = SamplingMetadata.prepare(
             seq_group_metadata_list,
             seq_lens,
@@ -422,6 +431,10 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()
 
+    def process_multi_modal_data_neuron(self, mm_data):
+        # this is a no-op for NeuronModelRunner
+        return mm_data
+
     def remove_all_loras(self):
         raise NotImplementedError(
             "LoRAs are not supported for Transformers NeuronX framework")
diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py
index aa94706c8..9cd4f88d3 100644
--- a/vllm/worker/neuronx_distributed_model_runner.py
+++ b/vllm/worker/neuronx_distributed_model_runner.py
@@ -3,6 +3,8 @@
 from typing import List, Optional, Set
 
 import torch
+from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import (
+    get_all_supported_aspect_ratios)
 from neuronx_distributed_inference.modules.generation.sampling import (
     prepare_sampling_params)
 from neuronx_distributed_inference.modules.lora_serving import (
@@ -17,7 +19,7 @@ from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.neuronx_distributed import (
     _get_model_architecture, get_neuron_model)
-from vllm.platforms import current_platform
+from vllm.multimodal import MultiModalKwargs
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.worker.neuron_model_runner import (ModelInputForNeuron,
                                              NeuronModelRunner)
@@ -121,42 +123,28 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
         sampling_params = self.get_nxd_sampling_params(
             model_input.sampling_metadata)
 
-        if model_input.multi_modal_kwargs.get('image') is not None:
-            pixel_values = []
-            aspect_ratios = []
-            num_chunks = []
-            has_image = []
-            for multi_modal_input in model_input.multi_modal_kwargs.get(
-                    'image'):
-                image_tensors = self.get_multi_modal_data_neuron(
-                    multi_modal_input.squeeze(0))
-                pixel_values.append(image_tensors[0])
-                aspect_ratios.append(image_tensors[1])
-                num_chunks.append(image_tensors[2])
-                has_image.append(image_tensors[3])
-
-            pixel_values = torch.cat(pixel_values, dim=0)
-            aspect_ratios = torch.cat(aspect_ratios, dim=0)
-            num_chunks = torch.cat(num_chunks, dim=0)
-            has_image = torch.cat(has_image, dim=0)
-
+        if model_input.multi_modal_kwargs.get('pixel_values') is not None:
             hidden_states = self.model(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
                 seq_ids=model_input.input_block_ids,
-                pixel_values=pixel_values,
-                aspect_ratios=aspect_ratios,
+                pixel_values=model_input.multi_modal_kwargs.get(
+                    'pixel_values'),
+                aspect_ratios=model_input.multi_modal_kwargs.get(
+                    'aspect_ratios'),
                 sampling_params=sampling_params,
-                num_chunks=num_chunks,
-                has_image=has_image,
+                num_chunks=model_input.multi_modal_kwargs.get('num_chunks'),
+                has_image=model_input.multi_modal_kwargs.get(
+                    'has_image').squeeze(1),
             )
         else:
-            empty_pixel_values = torch.zeros([1, 1, 4, 3, 560, 560],
+            bs = model_input.input_tokens.shape[0] if (model_input.input_tokens
+                                                       is not None) else 1
+            empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560],
                                              dtype=torch.bfloat16)
-            empty_aspect_ratios = torch.ones([1, 1, 2], dtype=torch.int64)
-            num_chunks = torch.tensor([[1]
-                                       ])  # dummy num_chunks, will not be used
-            has_image = torch.tensor([0])
+            empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64)
+            num_chunks = torch.zeros((bs, 1), dtype=torch.int32)
+            has_image = torch.zeros([bs], dtype=torch.int32)
             hidden_states = self.model(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
@@ -175,6 +163,27 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
 
         return [output]
 
+    def process_multi_modal_data_neuron(self, mm_data):
+        # Neuron uses aspect_ratios instead of aspect_ratio_ids
+        all_supported_aspect_ratios = get_all_supported_aspect_ratios(
+            self.model.config.vision_config.max_num_tiles)
+        aspect_ratio_ids = mm_data.get("aspect_ratio_ids")
+        mm_data["aspect_ratios"] = torch.tensor(
+            all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0)
+
+        # Neuron's num_chunks is HF's num_tiles
+        mm_data["num_chunks"] = mm_data.get("num_tiles")
+
+        # Input has an image if it has pixel_values
+        bs = mm_data["num_chunks"].shape[0]
+        pixel_values = mm_data.get("pixel_values")
+        if pixel_values is not None and not torch.all(pixel_values == 0):
+            mm_data["has_image"] = torch.ones(bs)
+
+        else:
+            mm_data["has_image"] = torch.zeros(bs)
+        return mm_data
+
     def _get_lora_adapter_ids(self, seq_group_metadata_list):
         # set LoRA adapter IDs for multi-lora serving
         batch_size = len(seq_group_metadata_list)
@@ -200,7 +209,6 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
         virtual_engine: int = 0,
         finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForNeuron:
-        multi_modal_kwargs = None
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
         is_prompt = seq_group_metadata_list[0].is_prompt
@@ -223,6 +231,14 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
                 sampling_params.top_p = top_p
                 sampling_params.temperature = temperature
 
+        # we need multi_modal_data for later tokens as well
+        multi_modal_kwargs_list: List[MultiModalKwargs] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                multi_modal_kwargs_list.append(mm_data)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
+
         lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list)
 
         sampling_metadata = SamplingMetadata.prepare(
@@ -236,18 +252,6 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
             self.pin_memory,
             generators=self.get_generators(finished_requests_ids))
 
-        if current_platform.use_transformers_neuronx(
-        ) and not self._on_device_sampling_disabled:
-            # Once the request IDs are changed in current iteration, we will
-            # update the on-device sampling parameters.
-            current_batch_request_ids = [
-                seq_group_meta_data.request_id
-                for seq_group_meta_data in seq_group_metadata_list
-            ]
-            if current_batch_request_ids != self._previous_batch_request_ids:
-                self._update_neuron_sampling_params(seq_group_metadata_list)
-                self._previous_batch_request_ids = current_batch_request_ids
-
         return ModelInputForNeuron(input_tokens=input_tokens,
                                    input_positions=input_positions,
                                    input_block_ids=input_block_ids,
-- 
GitLab


From 749f5bdd3879e8e7ebce1c56f9960481a6cf63bb Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Sat, 31 May 2025 18:39:21 +0800
Subject: [PATCH 092/274] [doc] fix the list rendering issue - security.md
 (#18982)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/usage/security.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/usage/security.md b/docs/usage/security.md
index f1661828d..1209cc8dd 100644
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -12,14 +12,14 @@ All communications between nodes in a multi-node vLLM deployment are **insecure
 
 The following options control inter-node communications in vLLM:
 
-1. **Environment Variables:**
+#### 1. **Environment Variables:**
    - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on
 
-2. **KV Cache Transfer Configuration:**
+#### 2. **KV Cache Transfer Configuration:**
    - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1)
    - `--kv-port`: The port for KV cache transfer communications (default: 14579)
 
-3. **Data Parallel Configuration:**
+#### 3. **Data Parallel Configuration:**
    - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1)
    - `data_parallel_master_port`: Port of the data parallel master (default: 29500)
 
@@ -39,16 +39,16 @@ Key points from the PyTorch security guide:
 
 ### Security Recommendations
 
-1. **Network Isolation:**
+#### 1. **Network Isolation:**
    - Deploy vLLM nodes on a dedicated, isolated network
    - Use network segmentation to prevent unauthorized access
    - Implement appropriate firewall rules
 
-2. **Configuration Best Practices:**
+#### 2. **Configuration Best Practices:**
    - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
    - Configure firewalls to only allow necessary ports between nodes
 
-3. **Access Control:**
+#### 3. **Access Control:**
    - Restrict physical and network access to the deployment environment
    - Implement proper authentication and authorization for management interfaces
    - Follow the principle of least privilege for all system components
-- 
GitLab


From c55d8046723325e09521a24ac076a8a7e64eaa52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luka=20Govedi=C4=8D?=
 <ProExpertProg@users.noreply.github.com>
Date: Sat, 31 May 2025 06:39:28 -0400
Subject: [PATCH 093/274] [BugFix] Pydantic part 2 (#18911)

Signed-off-by: luka <luka@neuralmagic.com>
---
 requirements/test.in  | 1 +
 requirements/test.txt | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/requirements/test.in b/requirements/test.in
index 87af61769..e906752ff 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -51,3 +51,4 @@ numpy
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
+pydantic>=2.10 # 2.9 leads to error on python 3.10
\ No newline at end of file
diff --git a/requirements/test.txt b/requirements/test.txt
index 89d477017..60dcaca81 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -480,12 +480,13 @@ pycparser==2.22
     # via cffi
 pycryptodomex==3.22.0
     # via blobfile
-pydantic==2.9.2
+pydantic==2.11.5
     # via
+    #   -r requirements/test.in
     #   datamodel-code-generator
     #   mistral-common
     #   mteb
-pydantic-core==2.23.4
+pydantic-core==2.33.2
     # via pydantic
 pygments==2.18.0
     # via rich
@@ -784,6 +785,9 @@ typing-extensions==4.12.2
     #   pydantic-core
     #   torch
     #   typer
+    #   typing-inspection
+typing-inspection==0.4.1
+    # via pydantic
 tzdata==2024.2
     # via pandas
 uri-template==1.3.0
-- 
GitLab


From 0f5e0d567e9949ffee4f1677bc9dd767804eadd0 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Sat, 31 May 2025 18:39:31 +0800
Subject: [PATCH 094/274] [FEAT][ROCm] Add AITER grouped topk for DeepSeekV2
 (#18825)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 tests/kernels/moe/test_rocm_aiter_topk.py     | 93 +++++++++++++++++++
 vllm/model_executor/layers/fused_moe/layer.py |  2 +-
 .../layers/fused_moe/rocm_aiter_fused_moe.py  | 78 +++++++++++++---
 3 files changed, 157 insertions(+), 16 deletions(-)

diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py
index b0d34ddfd..922fd66db 100644
--- a/tests/kernels/moe/test_rocm_aiter_topk.py
+++ b/tests/kernels/moe/test_rocm_aiter_topk.py
@@ -35,6 +35,15 @@ def test_rocm_aiter_biased_grouped_topk_custom_op_registration():
     assert callable(torch.ops.vllm.rocm_aiter_biased_grouped_topk)
 
 
+def test_rocm_aiter_grouped_topk_custom_op_registration():
+    """Test that the custom op is correctly registered."""
+    # Check if the op exists in torch.ops.vllm
+    assert hasattr(torch.ops.vllm, 'rocm_aiter_grouped_topk')
+
+    # Check if the op is callable
+    assert callable(torch.ops.vllm.rocm_aiter_grouped_topk)
+
+
 def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
     """Test that the op can be used with torch.compile."""
     # Create test tensors
@@ -120,3 +129,87 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
                           rtol=1e-2,
                           atol=1e-2)
     assert torch.allclose(topk_ids_original, topk_ids_compiled)
+
+
+def test_rocm_aiter_grouped_topk_torch_compile_compatibility():
+    """Test that the op can be used with torch.compile."""
+    # Create test tensors
+    token = 64
+    expert = 256
+    num_expert_group = 8
+    topk = 8
+    topk_group = 4
+    renormalize = True
+    scoring_func = "softmax"
+    scale_factor = 1.0
+
+    gating_output = torch.randn((token, expert),
+                                dtype=torch.bfloat16,
+                                device="cuda")
+
+    device = gating_output.device
+    topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
+    topk_weights = torch.empty((token, topk),
+                               dtype=torch.float32,
+                               device=device)
+
+    # Define a function that uses the op
+    def grouped_topk_fn(gating_output, topk_weights, topk_ids, scoring_func):
+        return torch.ops.vllm.rocm_aiter_grouped_topk(
+            gating_output, topk_weights, topk_ids, num_expert_group,
+            topk_group, renormalize, scoring_func, scale_factor)
+
+    # Verify the op's fake implementation
+    torch.library.opcheck(torch.ops.vllm.rocm_aiter_grouped_topk,
+                          (gating_output, topk_weights, topk_ids),
+                          kwargs={
+                              "num_expert_group": num_expert_group,
+                              "topk_group": topk_group,
+                              "need_renorm": renormalize,
+                              "scoring_func": scoring_func,
+                              "routed_scaling_factor": scale_factor
+                          },
+                          test_utils=("test_faketensor"))
+
+    # Compile the function with appropriate settings
+    compiled_fn = torch.compile(grouped_topk_fn,
+                                fullgraph=True,
+                                backend="inductor",
+                                mode="reduce-overhead",
+                                dynamic=False)
+
+    topk_weights_original = torch.empty((token, topk),
+                                        dtype=torch.float32,
+                                        device=device)
+    topk_ids_original = torch.empty((token, topk),
+                                    dtype=torch.int32,
+                                    device=device)
+
+    topk_weights_compiled = torch.empty((token, topk),
+                                        dtype=torch.float32,
+                                        device=device)
+    topk_ids_compiled = torch.empty((token, topk),
+                                    dtype=torch.int32,
+                                    device=device)
+
+    # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode)
+    grouped_topk_fn(gating_output, topk_weights_original, topk_ids_original,
+                    scoring_func)
+    compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled,
+                scoring_func)
+
+    # Sort the results for comparison since the order might not be deterministic
+    topk_ids_original, indices_original = torch.sort(topk_ids_original)
+    topk_weights_original = torch.gather(topk_weights_original, 1,
+                                         indices_original)
+
+    topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled)
+    topk_weights_compiled = torch.gather(topk_weights_compiled, 1,
+                                         indices_compiled)
+
+    # Verify results match
+    assert torch.allclose(topk_weights_original,
+                          topk_weights_compiled,
+                          rtol=1e-2,
+                          atol=1e-2)
+    assert torch.allclose(topk_ids_original, topk_ids_compiled)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 838a7c24b..af7b98e14 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -45,7 +45,7 @@ else:
     FusedMoEPrepareAndFinalize = None  # type: ignore
 if is_rocm_aiter_moe_enabled():
     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
-        rocm_aiter_biased_group_topk as grouped_topk)
+        rocm_aiter_grouped_topk as grouped_topk)
 else:
     from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
 if current_platform.is_tpu():
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 10b61fcda..824062491 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -140,6 +140,36 @@ def rocm_aiter_biased_grouped_topk_fake(
     pass
 
 
+def rocm_aiter_grouped_topk_impl(
+        gating_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0  # mul to topk_weights
+) -> None:
+
+    from aiter import grouped_topk
+
+    grouped_topk(gating_output, topk_weights, topk_ids, num_expert_group,
+                 topk_group, need_renorm, scoring_func, routed_scaling_factor)
+
+
+def rocm_aiter_grouped_topk_fake(
+        gating_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0  # mul to topk_weights
+) -> None:
+    pass
+
+
 def rocm_aiter_fused_moe_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
@@ -218,36 +248,54 @@ if current_platform.is_rocm():
         dispatch_key=current_platform.dispatch_key,
     )
 
+    direct_register_custom_op(
+        op_name="rocm_aiter_grouped_topk",
+        op_func=rocm_aiter_grouped_topk_impl,
+        mutates_args=["topk_weights", "topk_ids"],
+        fake_impl=rocm_aiter_grouped_topk_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+
 
-def rocm_aiter_biased_group_topk(
+def rocm_aiter_grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    scoring_func: str = "sigmoid",
+    scoring_func: str = "softmax",
     e_score_correction_bias: Optional[torch.Tensor] = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    assert scoring_func == "sigmoid", (
-        "rocm_aiter_biased_group_topk only supports 'sigmoid' scoring_func.")
-    assert e_score_correction_bias is not None, (
-        "'e_score_correction_bias' must not be None.")
     token = hidden_states.shape[0]
     device = hidden_states.device
     topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
     topk_weights = torch.empty((token, topk),
                                dtype=torch.float32,
                                device=device)
-    torch.ops.vllm.rocm_aiter_biased_grouped_topk(
-        gating_output,
-        e_score_correction_bias,
-        topk_weights,
-        topk_ids,
-        num_expert_group,
-        topk_group,
-        renormalize,
-    )
+
+    if e_score_correction_bias is not None:
+        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
+            gating_output,
+            e_score_correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+        )
+    else:
+        assert (scoring_func == "softmax" or scoring_func == "sigmoid")
+        torch.ops.vllm.rocm_aiter_grouped_topk(
+            gating_output,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            scoring_func,
+        )
+
     return topk_weights, topk_ids
 
 
-- 
GitLab


From f2c3f66d59f9e38aa94985b54f370219222e7bd1 Mon Sep 17 00:00:00 2001
From: Fred Reiss <frreiss@us.ibm.com>
Date: Sat, 31 May 2025 04:58:17 -0700
Subject: [PATCH 095/274] [Bugfix] Fix for issue 17396 (#18773)

Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
---
 vllm/lora/ops/torch_ops/lora_ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py
index af79f9841..ab65faceb 100644
--- a/vllm/lora/ops/torch_ops/lora_ops.py
+++ b/vllm/lora/ops/torch_ops/lora_ops.py
@@ -36,10 +36,13 @@ def bgmv_expand(inputs: torch.Tensor,
     if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
         limit = 1
 
+    # LoRA adapter and model may add different amounts of padding to output
+    common_len = min(outputs.shape[1], output_tensor.shape[1])
+
     if add_inputs:
-        output_tensor[:, :outputs.shape[1]] += outputs[:limit, :]
+        output_tensor[:, :common_len] += outputs[:limit, :common_len]
     else:
-        output_tensor[:, :outputs.shape[1]] = outputs[:limit, :]
+        output_tensor[:, :common_len] = outputs[:limit, :common_len]
 
 
 def sgmv_shrink(
-- 
GitLab


From 306d60401dbd066f64298e02ca73d4f2075d7bf6 Mon Sep 17 00:00:00 2001
From: Charlie Fu <charlifu@amd.com>
Date: Sat, 31 May 2025 09:40:05 -0500
Subject: [PATCH 096/274] [ROCm][Kernel] Add gfx950 support for skinny gemms
 (#18010)

Signed-off-by: charlifu <charlifu@amd.com>
---
 csrc/rocm/skinny_gemms.cu                     | 113 +++++++++++-------
 tests/kernels/quant_utils.py                  |  14 ++-
 .../layers/quantization/utils/w8a8_utils.py   |   4 +-
 vllm/model_executor/layers/utils.py           |   4 +-
 vllm/platforms/rocm.py                        |  10 +-
 5 files changed, 91 insertions(+), 54 deletions(-)

diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu
index b3717892d..e31aa0162 100644
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -13,14 +13,34 @@
 #include "dispatch_utils.h"
 #include "quantization/fp8/common.cuh"
 
-#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
-  #define __HIP__MI300_MI250__
+#if defined(__HIPCC__) && \
+    (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
+  #define __HIP__GFX9__
 #endif
 
-#if defined(__HIPCC__) && defined(__gfx942__)
-  #define __HIP__MI300__
+#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
+  #define __HIP__MI3XX__
 #endif
 
+#if defined(__gfx950__)
+  #define LDS_SIZE 160 * 1024
+#else
+  #define LDS_SIZE 64 * 1024
+#endif
+
+int get_lds_size() {
+  static bool is_cached = false;
+  static int result;
+  if (is_cached == false) {
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    std::string device_arch = dprops->gcnArchName;
+    size_t substring = device_arch.find("gfx95");
+    result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
+    is_cached = true;
+  }
+  return result;
+}
+
 #if defined(NDEBUG)
   #undef NDEBUG
   #include <assert.h>
@@ -267,7 +287,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
     V0 += (s.x + s.y);                                                        \
   }
 
-#if defined(__HIP__MI300_MI250__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
 // This version targets cases where A[] fits LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -275,7 +295,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
                      const scalar_t* __restrict__ A, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
-  #if defined(__HIP__MI300__)
+  constexpr int max_lds_len = LDS_SIZE / 2;
+  #if defined(__HIP__MI3XX__)
   constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
   #else
   constexpr bool use_mfma = false;
@@ -295,13 +316,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   };
 
   //----------------------------------------------------
-  // Reserving 64 KB of LDS to have 1 WG / CU
+  // Reserving 64/160 KB of LDS to have 1 WG / CU
   // Goal is to bring the activation matrix A to the LDS
   // and use it across the lifetime of the work group
   // TODO: When activation matrix is larger than 64 KB
   //	     then this is not goint to work!
   //----------------------------------------------------
-  __shared__ scalar_t s[1024 * 32];
+  __shared__ scalar_t s[max_lds_len];
 
   //----------------------------------------------------
   // Fetch the activation matrix to LDS
@@ -312,11 +333,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   // - Then the WG will move to another 8 K elements
   // TODO: Logic below will only work when K is multiple of 8
   //----------------------------------------------------
-  for (uint32_t k = 0; k < min(K * N, 32 * 1024);
+  for (uint32_t k = 0; k < min(K * N, max_lds_len);
        k += THRDS * WvPrGrp * A_CHUNK) {
     uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
 
-    if (k_in >= min(K * N, 32 * 1024)) break;
+    if (k_in >= min(K * N, max_lds_len)) break;
 
     *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
   }
@@ -517,7 +538,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
@@ -525,9 +546,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
                                  const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
-#if defined(__HIP__MI300_MI250__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
 // This version targets cases where A[] marginally exceeds LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -535,7 +556,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitK_hf_(const int K, const int M, const scalar_t* B,
                  const scalar_t* __restrict__ A, scalar_t* C,
                  const int _WvPrGrp, const int CuCount) {
-  #if defined(__HIP__MI300__)
+  constexpr int max_lds_len = LDS_SIZE / 2;
+  #if defined(__HIP__MI3XX__)
   constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
   #else
   constexpr bool use_mfma = false;
@@ -561,7 +583,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   // TODO: When activation matrix is larger than 64 KB
   //	     then this is not goint to work!
   //----------------------------------------------------
-  __shared__ scalar_t s[1024 * 32];
+  __shared__ scalar_t s[max_lds_len];
 
   //----------------------------------------------------
   // Computation of columns that need to be committed to memory!
@@ -598,11 +620,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   // - Then the WG will move to another 8 K elements
   // TODO: Logic below will only work when K is multiple of 8
   //----------------------------------------------------
-  for (uint32_t k = 0; k < min(K * N, 32 * 1024);
+  for (uint32_t k = 0; k < min(K * N, max_lds_len);
        k += THRDS * WvPrGrp * A_CHUNK) {
     uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
 
-    if (k_in >= min(K * N, 32 * 1024)) break;
+    if (k_in >= min(K * N, max_lds_len)) break;
 
     *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
   }
@@ -686,7 +708,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         // Fetch A activation matrix in interleaved fashion from LDS or memory
 
         for (int n = 0; n < N; n++) {
-          if (k_ + K * n < 32 * 1024)
+          if (k_ + K * n < max_lds_len)
             bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
           else
             bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
@@ -817,7 +839,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   }
 }
 
-#else   // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
@@ -825,9 +847,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
                              const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
-#if defined(__HIP__MI300_MI250__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
 // This version targets big A[] cases, where it is much larger than LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -835,7 +857,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
                      const scalar_t* __restrict__ A, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
-  #if defined(__HIP__MI300__)
+  constexpr int max_lds_len = LDS_SIZE / 2;
+  #if defined(__HIP__MI3XX__)
   constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
   #else
   constexpr bool use_mfma = false;
@@ -855,13 +878,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   };
 
   //----------------------------------------------------
-  // Reserving 64 KB of LDS to have 1 WG / CU
+  // Reserving 64/160 KB of LDS to have 1 WG / CU
   // Goal is to bring the activation matrix A to the LDS
   // and use it across the lifetime of the work group
   // TODO: When activation matrix is larger than 64 KB
   //	     then this is not goint to work!
   //----------------------------------------------------
-  __shared__ scalar_t s[1024 * 32];
+  __shared__ scalar_t s[max_lds_len];
 
   //----------------------------------------------------
   // Computation of columns that need to be committed to memory!
@@ -902,11 +925,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   //----------------------------------------------------
   #define PCML
   #ifndef PCML
-  for (uint32_t k = 0; k < min(K * N, 32 * 1024);
+  for (uint32_t k = 0; k < min(K * N, max_lds_len);
        k += THRDS * WvPrGrp * A_CHUNK) {
     uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
 
-    if (k_in >= min(K * N, 32 * 1024)) break;
+    if (k_in >= min(K * N, max_lds_len)) break;
 
     *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
   }
@@ -916,7 +939,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   #define TUC (THRDS * UNRL * A_CHUNK)
   uint32_t kBase = 0;
   // find biggest k size that fits in LDS
-  uint32_t kFit = (32 * 1024) / N;
+  uint32_t kFit = (max_lds_len) / N;
   // kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple
   // of TUC
   kFit = (kFit % TUC == 0)
@@ -1164,7 +1187,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     }
   }
 }
-#else   // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
@@ -1172,7 +1195,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
                                  const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
 int mindiv(int N, int div1, int div2) {
   int nPrRnd = div1 * div2;
@@ -1222,17 +1245,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int max_lds_len = get_lds_size() / 2;
 
 #define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
                  _N)                                                          \
   {                                                                           \
     dim3 block(64, _WvPrGrp);                                                 \
-    if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) {                \
+    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {              \
       int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);              \
       wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N>          \
           <<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp,    \
                                        CuCount);                              \
-    } else if (K_in * N_in <= 32 * 1024 * 1.2) {                              \
+    } else if (K_in * N_in <= max_lds_len * 1.2) {                            \
       int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp);              \
       wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N>              \
           <<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp,    \
@@ -1272,7 +1296,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
   return out_c;
 }
 
-#if defined(__HIP__MI300__)  // TODO: Add NAVI support
+#if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1281,6 +1305,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                       const float* __restrict__ s_A,
                       const float* __restrict__ s_B, const int _WvPrGrp,
                       const int CuCount) {
+  constexpr int max_lds_len = LDS_SIZE;
   using scalar8 =
       __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
   using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
@@ -1296,10 +1321,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     scalar8 h8;
   };
 
-  __shared__ fp8_t s[1024 * 64];
+  __shared__ fp8_t s[max_lds_len];
 
   for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
     *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
   }
   __syncthreads();
@@ -1436,7 +1461,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI300__) TODO: Add NAVI support
+#else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
@@ -1446,9 +1471,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
                                   const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300__) TODO: Add NAVI support
+#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support
 
-#if defined(__HIP__MI300__)  // TODO: Add NAVI support
+#if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1456,6 +1481,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                   const fp8_t* __restrict__ A, scalar_t* C,
                   const float* __restrict__ s_A, const float* __restrict__ s_B,
                   const int _WvPrGrp, const int CuCount) {
+  constexpr int max_lds_len = LDS_SIZE;
   using scalar8 =
       __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
   using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
@@ -1471,10 +1497,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     scalar8 h8;
   };
 
-  __shared__ fp8_t s[1024 * 64];
+  __shared__ fp8_t s[max_lds_len];
 
   for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
     *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
   }
   __syncthreads();
@@ -1517,7 +1543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
         if (k_ >= K) break;
         for (int n = 0; n < N; n++) {
-          if (k_ + K * n < 64 * 1024)
+          if (k_ + K * n < max_lds_len)
             bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
           else
             bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
@@ -1608,7 +1634,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI300__) TODO: Add NAVI support
+#else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
@@ -1618,7 +1644,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
                               const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300__) TODO: Add NAVI support
+#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support
 
 void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
                at::Tensor& scale_a, at::Tensor& scale_b,
@@ -1638,12 +1664,13 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
   dim3 grid(CuCount);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int max_lds_len = get_lds_size();
 
 #define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
                   _N)                                                          \
   {                                                                            \
     dim3 block(64, _WvPrGrp);                                                  \
-    if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) {                 \
+    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {               \
       int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);               \
       wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N>  \
           <<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 764924f26..892309a01 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -8,7 +8,7 @@ from vllm.platforms import current_platform
 
 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.
-ROCM_FP8_MAX = 224.0
+ROCM_FP8FNUZ_MAX = 224.0
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
@@ -26,9 +26,11 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
 
     qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
             else torch.finfo(quant_dtype)
-    qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
+    qtype_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                         else qtype_traits.max
-    qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
+    qtype_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                         else qtype_traits.min
     qtype_max = as_float32_tensor(qtype_traits_max)
     s_1 = as_float32_tensor(1.0)
@@ -70,9 +72,11 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
                     -> tuple[torch.tensor, torch.tensor]:
 
     fp8_traits = torch.finfo(FP8_DTYPE)
-    fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
+    fp8_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                     else fp8_traits.max
-    fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
+    fp8_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                     else fp8_traits.min
     fp8_max = as_float32_tensor(fp8_traits_max)
     one = as_float32_tensor(1.0)
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 4b041cff2..eed8998fe 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -155,8 +155,8 @@ def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
                                    scale_b: torch.Tensor, bias: torch.Tensor,
                                    input_2d: torch.Tensor,
                                    output_shape: list) -> torch.Tensor:
-    from vllm.platforms.rocm import on_mi250_mi300
-    if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300(
+    from vllm.platforms.rocm import on_mi3xx
+    if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx(
     ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0:
         output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b,
                                current_platform.get_cu_count())
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 18783d0d7..001e6aaf0 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -70,9 +70,9 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
 def rocm_unquantized_gemm(x: torch.Tensor,
                           weight: torch.Tensor,
                           bias: Optional[torch.Tensor] = None):
-    from vllm.platforms.rocm import on_mi250_mi300
+    from vllm.platforms.rocm import on_gfx9
     k = weight.shape[1]
-    use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300() and \
+    use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_gfx9() and \
                     x.dtype in [torch.float16, torch.bfloat16] \
                     and k % 8 == 0 and bias is None)
 
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 06ee8614d..ef1c632a5 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -105,9 +105,15 @@ def on_gfx1x() -> bool:
 
 
 @cache
-def on_mi250_mi300() -> bool:
+def on_mi3xx() -> bool:
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
-    return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942"])
+    return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"])
+
+
+@cache
+def on_gfx9() -> bool:
+    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+    return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
 
 
 @cache
-- 
GitLab


From 8bf507d766894a8104946e026a2eeb84cbf34242 Mon Sep 17 00:00:00 2001
From: ptarasiewiczNV <104908264+ptarasiewiczNV@users.noreply.github.com>
Date: Sat, 31 May 2025 17:19:18 +0200
Subject: [PATCH 097/274] [P/D] NixlConnector use cache device index for memory
 registration (#18969)

Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
---
 vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index f02434aeb..6a3472157 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -488,7 +488,8 @@ class NixlConnectorWorker:
             for cache in cache_list:
                 base_addr = cache.data_ptr()
                 region_len = self.num_blocks * self.block_len
-                caches_data.append((base_addr, region_len, self.rank, ""))
+                caches_data.append(
+                    (base_addr, region_len, cache.device.index, ""))
                 kv_caches_base_addr.append(base_addr)
         self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr
         self.num_regions = len(caches_data)
-- 
GitLab


From 9a1b9b99d7d21a65280fc68c82f5bb6152fdf9dd Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Sat, 31 May 2025 08:34:52 -0700
Subject: [PATCH 098/274] [BugFix] Fix multi-node offline data-parallel
 (#18981)

Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
---
 examples/offline_inference/data_parallel.py | 12 ++++++++----
 vllm/v1/engine/core_client.py               |  7 +++----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index bf60d883c..15906e1a2 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -97,10 +97,14 @@ def main(
     # with DP, each rank should process different prompts.
     # usually all the DP ranks process a full dataset,
     # and each rank processes a different part of the dataset.
-    promts_per_rank = len(prompts) // dp_size
-    start = global_dp_rank * promts_per_rank
-    end = start + promts_per_rank
-    prompts = prompts[start:end]
+    floor = len(prompts) // dp_size
+    remainder = len(prompts) % dp_size
+
+    # Distribute prompts into even groups.
+    def start(rank):
+        return rank * floor + min(rank, remainder)
+
+    prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)]
     if len(prompts) == 0:
         # if any rank has no prompts to process,
         # we need to set a placeholder prompt
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index e9e2d2d8d..232d6742b 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -363,6 +363,7 @@ class MPClient(EngineCoreClient):
             local_engine_count = parallel_config.data_parallel_size_local
             local_start_index = parallel_config.data_parallel_rank_local
             dp_size = parallel_config.data_parallel_size
+            dp_rank = parallel_config.data_parallel_rank
 
             # SPMD mode is where there is an LLM instance per DP rank and
             # one core engine per LLM, see
@@ -370,11 +371,9 @@ class MPClient(EngineCoreClient):
             spmd_mode = local_start_index is not None
             if spmd_mode:
                 assert local_engine_count == 1
-                self.core_engines = [
-                    CoreEngine(index=local_start_index, local=True)
-                ]
+                self.core_engines = [CoreEngine(index=dp_rank, local=True)]
             else:
-                assert parallel_config.data_parallel_rank == 0
+                assert dp_rank == 0
                 local_start_index = 0
                 self.core_engines = [
                     CoreEngine(index=i, local=(i < local_engine_count))
-- 
GitLab


From 20079c6e365752713c05de0eb719ac0307979c3b Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Sun, 1 Jun 2025 02:00:11 +0800
Subject: [PATCH 099/274] [Misc] add return token strs for tokenize (#18941)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 tests/entrypoints/openai/test_tokenization.py | 65 ++++++++++++++-----
 vllm/entrypoints/openai/protocol.py           | 11 ++++
 .../openai/serving_tokenization.py            |  5 ++
 3 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 9773f3e45..7d823542e 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -76,11 +76,11 @@ async def test_tokenize_completions(
                                  })
         response.raise_for_status()
 
-        assert response.json() == {
-            "tokens": tokens,
-            "count": len(tokens),
-            "max_model_len": 8192
-        }
+        result = response.json()
+        assert result["tokens"] == tokens
+        assert result["count"] == len(tokens)
+        assert result["max_model_len"] == 8192
+        assert result["token_strs"] is None
 
 
 @pytest.mark.asyncio
@@ -138,11 +138,11 @@ async def test_tokenize_chat(
                                          })
                 response.raise_for_status()
 
-                assert response.json() == {
-                    "tokens": tokens,
-                    "count": len(tokens),
-                    "max_model_len": 8192
-                }
+                result = response.json()
+                assert result["tokens"] == tokens
+                assert result["count"] == len(tokens)
+                assert result["max_model_len"] == 8192
+                assert result["token_strs"] is None
 
 
 @pytest.mark.asyncio
@@ -215,11 +215,46 @@ async def test_tokenize_chat_with_tools(
                 )
                 response.raise_for_status()
 
-                assert response.json() == {
-                    "tokens": tokens,
-                    "count": len(tokens),
-                    "max_model_len": 8192,
-                }
+                result = response.json()
+                assert result["tokens"] == tokens
+                assert result["count"] == len(tokens)
+                assert result["max_model_len"] == 8192
+                assert result["token_strs"] is None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name, tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenize_with_return_token_strs(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
+
+    prompt = "This is a token_strs test prompt! vllm1"
+    response = requests.post(
+        server.url_for("tokenize"),
+        json={
+            "prompt": prompt,
+            "model": model_name,
+            "return_token_strs": True
+        },
+    )
+    response.raise_for_status()
+
+    tokens = tokenizer.encode(prompt, add_special_tokens=True)
+    tokens_str = tokenizer.convert_ids_to_tokens(tokens)
+
+    result = response.json()
+    assert result["tokens"] == tokens
+    assert result["count"] == len(tokens)
+    assert result["max_model_len"] == 8192
+    assert result["token_strs"] == tokens_str
 
 
 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 2f641079e..e72c23993 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1563,6 +1563,11 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
             "If true (the default), special tokens (e.g. BOS) will be added to "
             "the prompt."),
     )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
 
 
 class TokenizeChatRequest(OpenAIBaseModel):
@@ -1576,6 +1581,11 @@ class TokenizeChatRequest(OpenAIBaseModel):
          "This is a parameter used by chat template in tokenizer config of the "
          "model."),
     )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
     continue_final_message: bool = Field(
         default=False,
         description=
@@ -1633,6 +1643,7 @@ class TokenizeResponse(OpenAIBaseModel):
     count: int
     max_model_len: int
     tokens: list[int]
+    token_strs: Optional[list[str]] = None
 
 
 class DetokenizeRequest(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 5ef1a486d..0d739bbf9 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -110,7 +110,12 @@ class OpenAIServingTokenization(OpenAIServing):
                           dict) and "prompt_token_ids" in engine_prompt:
                 input_ids.extend(engine_prompt["prompt_token_ids"])
 
+        token_strs = None
+        if request.return_token_strs:
+            token_strs = tokenizer.convert_ids_to_tokens(input_ids)
+
         return TokenizeResponse(tokens=input_ids,
+                                token_strs=token_strs,
                                 count=len(input_ids),
                                 max_model_len=self.max_model_len)
 
-- 
GitLab


From bbfa0c61d195968dd0e9a6169243019109098a2c Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Sat, 31 May 2025 15:07:38 -0400
Subject: [PATCH 100/274] [Misc][Benchmark] Add support for CustomDataset
 (#18511)

---
 benchmarks/README.md            | 48 ++++++++++++++++
 benchmarks/benchmark_dataset.py | 94 +++++++++++++++++++++++++++++++-
 benchmarks/benchmark_serving.py | 30 +++++++++-
 vllm/benchmarks/datasets.py     | 97 ++++++++++++++++++++++++++++++++-
 vllm/benchmarks/serve.py        |  3 +
 5 files changed, 264 insertions(+), 8 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index cbf2f281b..6f9fbb91c 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -64,6 +64,12 @@ become available.
       <td style="text-align: center;">✅</td>
       <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
     </tr>
+    <tr>
+      <td><strong>Custom</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>Local file: <code>data.jsonl</code></td>
+    </tr>
   </tbody>
 </table>
 
@@ -124,6 +130,38 @@ P99 ITL (ms):                            8.39
 ==================================================
 ```
 
+### Custom Dataset
+If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
+
+```
+{"prompt": "What is the capital of India?"}
+{"prompt": "What is the capital of Iran?"}
+{"prompt": "What is the capital of China?"}
+``` 
+
+```bash
+# start server
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+```
+
+```bash
+# run benchmarking script
+python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+  --backend vllm \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --endpoint /v1/completions \
+  --dataset-name custom \
+  --dataset-path <path-to-your-data-jsonl> \
+  --custom-skip-chat-template \
+  --num-prompts 80 \
+  --max-concurrency 1 \
+  --temperature=0.3 \
+  --top-p=0.75 \
+  --result-dir "./log/"
+```
+
+You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
+
 ### VisionArena Benchmark for Vision Language Models
 
 ```bash
@@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \
     --seed 42
 ```
 
+**`philschmid/mt-bench`**
+
+``` bash
+python3 vllm/benchmarks/benchmark_serving.py \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 80
+```
+
 ### Running With Sampling Parameters
 
 When using OpenAI-compatible backends such as `vllm`, optional sampling
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 5513a5f78..d86bf045e 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -9,9 +9,6 @@ generation. Supported dataset types include:
   - BurstGPT
   - HuggingFace
   - VisionArena
-
-TODO: Implement CustomDataset to parse a JSON file and convert its contents into
-SampleRequest instances, similar to the approach used in ShareGPT.
 """
 
 import base64
@@ -442,6 +439,97 @@ class ShareGPTDataset(BenchmarkDataset):
         return samples
 
 
+# -----------------------------------------------------------------------------
+# Custom Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class CustomDataset(BenchmarkDataset):
+    """
+    Implements the Custom dataset.  Loads data from a JSONL file and generates
+    sample requests based on conversation turns. E.g.,
+    ```
+    {"prompt": "What is the capital of India?"}
+    {"prompt": "What is the capital of Iran?"}
+    {"prompt": "What is the capital of China?"}
+    ```
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        # self.data will be a list of dictionaries
+        # e.g., [{"prompt": "What is the capital of India?"}, ...]
+        # This will be the standardized format which load_data()
+        # has to convert into depending on the filetype of dataset_path.
+        # sample() will assume this standardized format of self.data
+        self.data = []
+
+        # Load the JSONL file
+        if self.dataset_path.endswith(".jsonl"):
+            jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
+
+            # check if the JSONL file has a 'prompt' column
+            if "prompt" not in jsonl_data.columns:
+                raise ValueError("JSONL file must contain a 'prompt' column.")
+
+            # Convert each row to a dictionary and append to self.data
+            # This will convert the DataFrame to a list of dictionaries
+            # where each dictionary corresponds to a row in the DataFrame.
+            # This is the standardized format we want for self.data
+            for _, row in jsonl_data.iterrows():
+                self.data.append(row.to_dict())
+        else:
+            raise NotImplementedError(
+                "Only JSONL format is supported for CustomDataset."
+            )
+
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
+        **kwargs,
+    ) -> list:
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["prompt"]
+
+            # apply template
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": prompt}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                )
+            )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+
+        return sampled_requests
+
+
 # -----------------------------------------------------------------------------
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 79024a9d6..6bd9f1b49 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -60,6 +60,7 @@ from benchmark_dataset import (
     ASRDataset,
     BurstGPTDataset,
     ConversationDataset,
+    CustomDataset,
     HuggingFaceDataset,
     InstructCoderDataset,
     MTBenchDataset,
@@ -627,7 +628,16 @@ def main(args: argparse.Namespace):
             "'--dataset-path' if required."
         )
 
-    if args.dataset_name == "sonnet":
+    if args.dataset_name == "custom":
+        dataset = CustomDataset(dataset_path=args.dataset_path)
+        input_requests = dataset.sample(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            output_len=args.custom_output_len,
+            skip_chat_template=args.custom_skip_chat_template,
+        )
+
+    elif args.dataset_name == "sonnet":
         dataset = SonnetDataset(dataset_path=args.dataset_path)
         # For the "sonnet" dataset, formatting depends on the backend.
         if args.backend == "openai-chat":
@@ -838,6 +848,8 @@ def main(args: argparse.Namespace):
             ]:
                 if field in result_json:
                     del result_json[field]
+                if field in benchmark_result:
+                    del benchmark_result[field]
 
         # Save to file
         base_model_id = model_id.split("/")[-1]
@@ -850,6 +862,7 @@ def main(args: argparse.Namespace):
         if args.result_filename:
             file_name = args.result_filename
         if args.result_dir:
+            os.makedirs(args.result_dir, exist_ok=True)
             file_name = os.path.join(args.result_dir, file_name)
         with open(
             file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
@@ -890,7 +903,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1060,6 +1073,19 @@ if __name__ == "__main__":
     )
 
     # group for dataset specific arguments
+    custom_group = parser.add_argument_group("custom dataset options")
+    custom_group.add_argument(
+        "--custom-output-len",
+        type=int,
+        default=256,
+        help="Number of output tokens per request, used only for custom dataset.",
+    )
+    custom_group.add_argument(
+        "--custom-skip-chat-template",
+        action="store_true",
+        help="Skip applying chat template to prompt, used only for custom dataset.",
+    )
+
     sonnet_group = parser.add_argument_group("sonnet dataset options")
     sonnet_group.add_argument(
         "--sonnet-input-len",
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 712e83528..35cc303f6 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -9,9 +9,6 @@ generation. Supported dataset types include:
   - BurstGPT
   - HuggingFace
   - VisionArena
-
-TODO: Implement CustomDataset to parse a JSON file and convert its contents into
-SampleRequest instances, similar to the approach used in ShareGPT.
 """
 import base64
 import io
@@ -26,6 +23,7 @@ from io import BytesIO
 from typing import Any, Callable, Optional, Union
 
 import numpy as np
+import pandas as pd
 from PIL import Image
 from transformers import PreTrainedTokenizerBase
 
@@ -443,6 +441,99 @@ class ShareGPTDataset(BenchmarkDataset):
         return samples
 
 
+# -----------------------------------------------------------------------------
+# Custom Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class CustomDataset(BenchmarkDataset):
+    """
+    Implements the Custom dataset.  Loads data from a JSONL file and generates
+    sample requests based on conversation turns. E.g.,
+    ```
+    {"prompt": "What is the capital of India?"}
+    {"prompt": "What is the capital of Iran?"}
+    {"prompt": "What is the capital of China?"}
+    ```
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        # self.data will be a list of dictionaries
+        # e.g., [{"prompt": "What is the capital of India?"}, ...]
+        # This will be the standardized format which load_data()
+        # has to convert into depending on the filetype of dataset_path.
+        # sample() will assume this standardized format of self.data
+        self.data = []
+
+        # Load the JSONL file
+        if self.dataset_path.endswith(".jsonl"):
+            jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
+                                      lines=True)
+
+            # check if the JSONL file has a 'prompt' column
+            if "prompt" not in jsonl_data.columns:
+                raise ValueError("JSONL file must contain a 'prompt' column.")
+
+            # Convert each row to a dictionary and append to self.data
+            # This will convert the DataFrame to a list of dictionaries
+            # where each dictionary corresponds to a row in the DataFrame.
+            # This is the standardized format we want for self.data
+            for _, row in jsonl_data.iterrows():
+                self.data.append(row.to_dict())
+        else:
+            raise NotImplementedError(
+                "Only JSONL format is supported for CustomDataset.")
+
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
+        **kwargs,
+    ) -> list:
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["prompt"]
+
+            # apply template
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{
+                        "role": "user",
+                        "content": prompt
+                    }],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+
+        return sampled_requests
+
+
 # -----------------------------------------------------------------------------
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 040815e87..858a0c6a0 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1110,6 +1110,8 @@ def main(args: argparse.Namespace):
             ]:
                 if field in result_json:
                     del result_json[field]
+                if field in benchmark_result:
+                    del benchmark_result[field]
 
         # Save to file
         base_model_id = model_id.split("/")[-1]
@@ -1120,6 +1122,7 @@ def main(args: argparse.Namespace):
         if args.result_filename:
             file_name = args.result_filename
         if args.result_dir:
+            os.makedirs(args.result_dir, exist_ok=True)
             file_name = os.path.join(args.result_dir, file_name)
         with open(file_name,
                   mode="a+" if args.append_result else "w",
-- 
GitLab


From 1bc86a3da1bd45e7d43347d6532a515950a438f0 Mon Sep 17 00:00:00 2001
From: Benjamin Chislett <benjamin.chislett@centml.ai>
Date: Sat, 31 May 2025 22:58:07 -0400
Subject: [PATCH 101/274] [Bugfix] Fix EAGLE3 broken logits (#18909)

Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
---
 vllm/model_executor/models/llama_eagle3.py | 23 +++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index f211bfe54..1e40017fc 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -215,6 +215,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
         logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         if self.draft_id_to_target_id is None:
+            assert logits.shape[1] == self.config.vocab_size, \
+                "Expected logits to have shape " \
+                f"(*, {self.config.vocab_size}), but got {logits.shape}"
             return logits
 
         base = torch.arange(self.config.draft_vocab_size, device=logits.device)
@@ -234,24 +237,22 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
         return self.model.fc(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=None,
-        )
-
         model_weights = {}
+        includes_draft_id_mapping = False
         for name, loaded_weight in weights:
             if "t2d" in name:
                 continue
             if "d2t" in name:
                 name = name.replace("d2t", "draft_id_to_target_id")
+                includes_draft_id_mapping = True
             elif "lm_head" not in name:
                 name = "model." + name
             model_weights[name] = loaded_weight
 
-        loaded_weights = loader.load_weights(model_weights.items())
-
-        if 'd2t' not in loaded_weights:
-            self.draft_id_to_target_id = None
-
-        return loaded_weights
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=None,
+            skip_substrs=["draft_id_to_target_id"] \
+                if not includes_draft_id_mapping else None,
+        )
+        loader.load_weights(model_weights.items())
-- 
GitLab


From 6aa8f9a4e7ed7af459476afe4f293f383e35f3e8 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 1 Jun 2025 11:04:23 +0800
Subject: [PATCH 102/274] [Core] Rework dtype resolution (#18751)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../test_basic_correctness.py                 |   5 +-
 tests/conftest.py                             |   7 +-
 tests/models/language/pooling/mteb_utils.py   |  11 +-
 .../language/pooling/test_classification.py   |   2 +-
 .../models/language/pooling/test_embedding.py |   6 +-
 .../multimodal/generation/test_whisper.py     |   1 +
 .../multimodal/processing/test_common.py      |   2 +-
 tests/samplers/test_no_bad_words.py           |   2 +-
 tests/test_utils.py                           | 102 +++++++--
 vllm/config.py                                | 202 ++++++++++++------
 vllm/platforms/cpu.py                         |   2 +-
 vllm/transformers_utils/config.py             |  40 +++-
 vllm/utils.py                                 |  51 ++++-
 13 files changed, 314 insertions(+), 119 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 86b5e1e0a..11c8e7a4b 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -60,7 +60,6 @@ def _fix_prompt_embed_outputs(
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
-@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
@@ -69,7 +68,6 @@ def test_models(
     hf_runner,
     model: str,
     backend: str,
-    dtype: str,
     max_tokens: int,
     enforce_eager: bool,
     enable_prompt_embeds: bool,
@@ -97,7 +95,7 @@ def test_models(
             str(i) for i in range(1024)) + " are:"
         example_prompts = [prompt]
 
-        with hf_runner(model, dtype=dtype) as hf_model:
+        with hf_runner(model) as hf_model:
             hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
             if enable_prompt_embeds:
                 with torch.no_grad():
@@ -106,7 +104,6 @@ def test_models(
 
         with VllmRunner(model,
                         max_model_len=8192,
-                        dtype=dtype,
                         enforce_eager=enforce_eager,
                         enable_prompt_embeds=enable_prompt_embeds,
                         gpu_memory_utilization=0.7) as vllm_model:
diff --git a/tests/conftest.py b/tests/conftest.py
index 26674483f..6336c6c2c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -324,7 +324,12 @@ class HfRunner:
             trust_remote_code=trust_remote_code,
         )
         self.device = self.get_default_device()
-        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
+        self.dtype = torch_dtype = _get_and_verify_dtype(
+            self.model_name,
+            self.config,
+            dtype=dtype,
+            is_pooling_model=is_sentence_transformer or is_cross_encoder,
+        )
 
         model_kwargs = model_kwargs if model_kwargs is not None else {}
         model_kwargs.setdefault("torch_dtype", torch_dtype)
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py
index f4837ae95..f45168bc0 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -102,21 +102,18 @@ def mteb_test_embed_models(hf_runner,
         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
         vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
-        model_dtype = getattr(
-            vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
-            vllm_dtype)
 
-    with set_default_torch_dtype(model_dtype) and hf_runner(
+    with set_default_torch_dtype(vllm_dtype) and hf_runner(
             model_info.name, is_sentence_transformer=True,
-            dtype=model_dtype) as hf_model:
+            dtype=vllm_dtype) as hf_model:
 
         if hf_model_callback is not None:
             hf_model_callback(hf_model)
 
         st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
 
-    print("VLLM:", vllm_dtype, vllm_main_score)
-    print("SentenceTransformer:", model_dtype, st_main_score)
+    print("VLLM:", vllm_main_score)
+    print("SentenceTransformers:", st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
     assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py
index 44af3df08..57b3cb58d 100644
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -43,6 +43,6 @@ def test_models(
 
         # the tolerance value of 1e-2 is selected based on the
         # half datatype tests in
-        # tests/models/embedding/language/test_embedding.py
+        # tests/models/language/pooling/test_embedding.py
         assert torch.allclose(hf_output, vllm_output,
                               1e-3 if dtype == "float" else 1e-2)
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index 306cfdf37..8f82c8091 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -30,13 +30,11 @@ from ...utils import check_embeddings_close
         pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
 )
-@pytest.mark.parametrize("dtype", ["half"])
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model,
-    dtype: str,
     monkeypatch,
 ) -> None:
 
@@ -58,13 +56,11 @@ def test_models(
     # So we need to strip the input texts to avoid test failing.
     example_prompts = [str(s).strip() for s in example_prompts]
 
-    with hf_runner(model, dtype=dtype,
-                   is_sentence_transformer=True) as hf_model:
+    with hf_runner(model, is_sentence_transformer=True) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
 
     with vllm_runner(model,
                      task="embed",
-                     dtype=dtype,
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
         vllm_outputs = vllm_model.encode(example_prompts)
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 4e48bdbd0..d0b85842a 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -100,6 +100,7 @@ def run_test(
 
     with vllm_runner(
             model,
+            dtype="half",
             max_model_len=448,
             tensor_parallel_size=tensor_parallel_size,
             distributed_executor_backend=distributed_executor_backend,
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 572fa366d..d7f950c23 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -40,7 +40,7 @@ def _test_processing_correctness(
         tokenizer_mode=model_info.tokenizer_mode,
         trust_remote_code=model_info.trust_remote_code,
         seed=0,
-        dtype="float16",
+        dtype="auto",
         revision=None,
         hf_overrides=model_info.hf_overrides,
     )
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index 355e3adcf..f9688b4b9 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -103,7 +103,7 @@ class TestTwoTokenBadWord:
                                                 add_special_tokens=False)[0]
 
     def test_two_token_bad_word(self, vllm_runner):
-        with vllm_runner(self.MODEL) as llm:
+        with vllm_runner(self.MODEL, dtype="half") as llm:
             output_token_ids = self._generate(llm)
             assert output_token_ids[:2] == [
                 self.target_token_id1, self.target_token_id2
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0b88d05ef..dd8777f06 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -17,7 +17,8 @@ from vllm_test_utils.monitor import monitor
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
                         MemorySnapshot, PlaceholderModule, StoreBoolean,
-                        bind_kv_cache, deprecate_kwargs, get_open_port,
+                        bind_kv_cache, common_broadcastable_dtype,
+                        deprecate_kwargs, get_open_port, is_lossless_cast,
                         make_zmq_path, make_zmq_socket, memory_profiling,
                         merge_async_iterators, sha256, split_zmq_path,
                         supports_kw, swap_dict_values)
@@ -567,12 +568,65 @@ def test_lru_cache():
     assert 6 in cache
 
 
+# yapf: disable
+@pytest.mark.parametrize(
+    ("src_dtype", "tgt_dtype", "expected_result"),
+    [
+        # Different precision_levels
+        (torch.bool, torch.int8, True),
+        (torch.bool, torch.float16, True),
+        (torch.bool, torch.complex32, True),
+        (torch.int64, torch.bool, False),
+        (torch.int64, torch.float16, True),
+        (torch.int64, torch.complex32, True),
+        (torch.float64, torch.bool, False),
+        (torch.float64, torch.int8, False),
+        (torch.float64, torch.complex32, True),
+        (torch.complex128, torch.bool, False),
+        (torch.complex128, torch.int8, False),
+        (torch.complex128, torch.float16, False),
+        # precision_level=0
+        (torch.bool, torch.bool, True),
+        # precision_level=1
+        (torch.int8, torch.int16, True),
+        (torch.int16, torch.int8, False),
+        (torch.uint8, torch.int8, False),
+        (torch.int8, torch.uint8, False),
+        # precision_level=2
+        (torch.float16, torch.float32, True),
+        (torch.float32, torch.float16, False),
+        (torch.bfloat16, torch.float32, True),
+        (torch.float32, torch.bfloat16, False),
+        # precision_level=3
+        (torch.complex32, torch.complex64, True),
+        (torch.complex64, torch.complex32, False),
+    ],
+)
+# yapf: enable
+def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
+    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("dtypes", "expected_result"),
+    [
+        ([torch.bool], torch.bool),
+        ([torch.bool, torch.int8], torch.int8),
+        ([torch.bool, torch.int8, torch.float16], torch.float16),
+        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
+    ],
+)
+# yapf: enable
+def test_common_broadcastable_dtype(dtypes, expected_result):
+    assert common_broadcastable_dtype(dtypes) == expected_result
+
+
 def test_placeholder_module_error_handling():
     placeholder = PlaceholderModule("placeholder_1234")
 
     def build_ctx():
-        return pytest.raises(ModuleNotFoundError,
-                             match="No module named")
+        return pytest.raises(ModuleNotFoundError, match="No module named")
 
     with build_ctx():
         int(placeholder)
@@ -608,6 +662,7 @@ def test_placeholder_module_error_handling():
         _ = placeholder_attr.module
 
 
+# yapf: disable
 @pytest.mark.parametrize(
     "obj,key1,key2",
     [
@@ -618,6 +673,7 @@ def test_placeholder_module_error_handling():
         # Tests for both keys do not exist
         ({1: "a", 2: "b"}, 3, 4),
     ])
+# yapf: enable
 def test_swap_dict_values(obj, key1, key2):
     original_obj = obj.copy()
     swap_dict_values(obj, key1, key2)
@@ -631,19 +687,19 @@ def test_swap_dict_values(obj, key1, key2):
         assert key1 not in obj
 
 
-def test_model_specification(parser_with_config,
-                             cli_config_file,
+def test_model_specification(parser_with_config, cli_config_file,
                              cli_config_file_with_model):
     # Test model in CLI takes precedence over config
-    args = parser_with_config.parse_args([
-        'serve', 'cli-model', '--config', cli_config_file_with_model
-    ])
+    args = parser_with_config.parse_args(
+        ['serve', 'cli-model', '--config', cli_config_file_with_model])
     assert args.model_tag == 'cli-model'
     assert args.served_model_name == 'mymodel'
 
     # Test model from config file works
     args = parser_with_config.parse_args([
-        'serve', '--config', cli_config_file_with_model,
+        'serve',
+        '--config',
+        cli_config_file_with_model,
     ])
     assert args.model == 'config-model'
     assert args.served_model_name == 'mymodel'
@@ -654,17 +710,19 @@ def test_model_specification(parser_with_config,
 
     # Test using --model option raises error
     with pytest.raises(
-        ValueError,
-        match=(
-            "With `vllm serve`, you should provide the model as a positional "
-            "argument or in a config file instead of via the `--model` option."
-        ),
+            ValueError,
+            match=
+        ("With `vllm serve`, you should provide the model as a positional "
+         "argument or in a config file instead of via the `--model` option."),
     ):
         parser_with_config.parse_args(['serve', '--model', 'my-model'])
 
     # Test other config values are preserved
     args = parser_with_config.parse_args([
-        'serve', 'cli-model', '--config', cli_config_file_with_model,
+        'serve',
+        'cli-model',
+        '--config',
+        cli_config_file_with_model,
     ])
     assert args.tensor_parallel_size == 2
     assert args.trust_remote_code is True
@@ -673,7 +731,7 @@ def test_model_specification(parser_with_config,
 
 
 @pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
-                                    (None, bool, [1, 2, 3])])
+                                   (None, bool, [1, 2, 3])])
 @pytest.mark.parametrize("output", [0, 1, 2])
 def test_sha256(input: tuple, output: int):
     hash = sha256(input)
@@ -682,7 +740,8 @@ def test_sha256(input: tuple, output: int):
     assert hash != 0
 
     bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), byteorder="big")
+    assert hash == int.from_bytes(hashlib.sha256(bytes).digest(),
+                                  byteorder="big")
 
     # hashing again, returns the same value
     assert hash == sha256(input)
@@ -698,8 +757,7 @@ def test_sha256(input: tuple, output: int):
         ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
         ("tcp://[::1]:5555", ("tcp", "::1", "5555")),  # IPv6 address
         ("inproc://some_identifier", ("inproc", "some_identifier", "")),
-    ]
-)
+    ])
 def test_split_zmq_path(path, expected):
     assert split_zmq_path(path) == expected
 
@@ -711,8 +769,7 @@ def test_split_zmq_path(path, expected):
         "tcp://127.0.0.1",  # Missing port
         "tcp://[::1]",  # Missing port for IPv6
         "tcp://:5555",  # Missing host
-    ]
-)
+    ])
 def test_split_zmq_path_invalid(invalid_path):
     with pytest.raises(ValueError):
         split_zmq_path(invalid_path)
@@ -734,7 +791,8 @@ def test_make_zmq_socket_ipv6():
     zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
 
     # Verify that the IPV6 option is set
-    assert zsock.getsockopt(zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
+    assert zsock.getsockopt(
+        zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
 
     # Clean up
     zsock.close()
diff --git a/vllm/config.py b/vllm/config.py
index dfa44b044..f400e9875 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -24,6 +24,7 @@ import torch
 from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
                       model_validator)
 from pydantic.dataclasses import dataclass
+from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.distributed import ProcessGroup, ReduceOp
 from transformers import PretrainedConfig
 from typing_extensions import deprecated, runtime_checkable
@@ -42,15 +43,16 @@ from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    try_get_generation_config, uses_mrope)
+    try_get_generation_config, try_get_safetensors_metadata, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
                         MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                         POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
-                        LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, get_open_port, is_torch_equal_or_newer,
-                        random_uuid, resolve_obj_by_qualname)
+                        LayerBlockType, common_broadcastable_dtype,
+                        cuda_device_count_stateless, get_cpu_memory,
+                        get_open_port, is_torch_equal_or_newer, random_uuid,
+                        resolve_obj_by_qualname)
 
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
@@ -540,7 +542,24 @@ class ModelConfig:
         self.encoder_config = self._get_encoder_config()
         self.hf_image_processor_config = get_hf_image_processor_config(
             self.model, hf_token=self.hf_token, revision=self.revision)
-        self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype)
+
+        supported_tasks, task = self._resolve_task(self.task)
+        self.supported_tasks = supported_tasks
+        self.task = task
+        if self.task in ("draft", "generate"):
+            self.truncation_side = "left"
+        else:
+            self.truncation_side = "right"
+
+        self.pooler_config = self._init_pooler_config()
+
+        self.dtype = _get_and_verify_dtype(
+            self.model,
+            self.hf_config,
+            self.dtype,
+            is_pooling_model=self.runner_type == "pooling",
+            revision=self.revision,
+        )
 
         # Workaround for Gemma 2 which uses interleaved sliding window
         # attention, but it's not specified in its config. TODO: remove this
@@ -597,16 +616,6 @@ class ModelConfig:
             raise ValueError(
                 "`override_neuron_config` is only supported on Neuron.")
 
-        supported_tasks, task = self._resolve_task(self.task)
-        self.supported_tasks = supported_tasks
-        self.task = task
-        if self.task in ("draft", "generate"):
-            self.truncation_side = "left"
-        else:
-            self.truncation_side = "right"
-
-        self.pooler_config = self._init_pooler_config()
-
         self._verify_quantization()
         self._verify_cuda_graph()
         self._verify_bnb_config()
@@ -692,7 +701,6 @@ class ModelConfig:
             self.model, self.revision)
 
     def _init_pooler_config(self) -> Optional["PoolerConfig"]:
-
         if self.runner_type == "pooling":
             if isinstance(self.override_pooler_config, dict):
                 self.override_pooler_config = PoolerConfig(
@@ -3074,13 +3082,37 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
     "bfloat16": torch.bfloat16,
 }
 
-_ROCM_NOT_SUPPORTED_DTYPE: list[str] = []  #
+# model_type -> reason
+_FLOAT16_NOT_SUPPORTED_MODELS = {
+    "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
+    "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.",
+    "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.",
+    "glm4": "Numerical instability. Please use bfloat16 or float32 instead.",
+}
 
 
-def _get_and_verify_dtype(
+def _is_valid_dtype(model_type: str, dtype: torch.dtype):
+    if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16:  # noqa: E501, SIM103
+        return False
+
+    return True
+
+
+def _check_valid_dtype(model_type: str, dtype: torch.dtype):
+    if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16:
+        reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type]
+        raise ValueError(f"The model type {model_type!r} "
+                         f"does not support float16. Reason: {reason}")
+
+    return True
+
+
+def _find_dtype(
+    model_id: str,
     config: PretrainedConfig,
-    dtype: Union[str, torch.dtype],
-) -> torch.dtype:
+    *,
+    revision: Optional[str],
+):
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
     config_dtype = getattr(config, "torch_dtype", None)
@@ -3092,75 +3124,111 @@ def _get_and_verify_dtype(
     if config_dtype is None and hasattr(config, "vision_config"):
         config_dtype = getattr(config.vision_config, "torch_dtype", None)
 
+    # Try to read the dtype of the weights if they are in safetensors format
+    if config_dtype is None:
+        repo_mt = try_get_safetensors_metadata(model_id, revision=revision)
+
+        if repo_mt and (files_mt := repo_mt.files_metadata):
+            param_dtypes: set[torch.dtype] = {
+                _SAFETENSORS_TO_TORCH_DTYPE[dtype_str]
+                for file_mt in files_mt.values()
+                for dtype_str in file_mt.parameter_count
+                if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE
+            }
+
+            if param_dtypes:
+                return common_broadcastable_dtype(param_dtypes)
+
     if config_dtype is None:
         config_dtype = torch.float32
 
-    if isinstance(dtype, str):
-        dtype = dtype.lower()
-        if dtype == "auto":
-            # Set default dtype from model config
-            if config_dtype == torch.float32:
-                # Following common practice, we use float16 for float32 models
-                torch_dtype = torch.float16
-            else:
-                torch_dtype = config_dtype
+    return config_dtype
 
-            if config.model_type == "plamo2":
-                logger.warning(
-                    "For PLaMo2, we cast models to bfloat16 instead of using "
-                    "float16 by default. This is because float16 does not work."
-                )
-                torch_dtype = torch.bfloat16
 
-            # Deal with torch dtype fallback for device compatibility.
-            from vllm.platforms import current_platform
-            if torch_dtype not in current_platform.supported_dtypes:
-                device_name = current_platform.get_device_name()
+def _resolve_auto_dtype(
+    model_type: str,
+    config_dtype: torch.dtype,
+    *,
+    is_pooling_model: bool,
+):
+    from vllm.platforms import current_platform
 
-                if ((capability := current_platform.get_device_capability())
-                        is None):
-                    compute_str = ""
-                else:
-                    version_str = capability.as_version_str()
-                    compute_str = f" (with compute capability {version_str})"
-                fallback_dtype = current_platform.supported_dtypes[0]
-                logger.warning(
-                    "Your %s device%s doesn't support %s. " \
-                    "Falling back to %s for compatibility.",
-                    device_name, compute_str, torch_dtype, fallback_dtype
-                    )
-                torch_dtype = fallback_dtype
+    supported_dtypes = [
+        dtype for dtype in current_platform.supported_dtypes
+        if _is_valid_dtype(model_type, dtype)
+    ]
 
-            if current_platform.is_hpu() and torch_dtype == torch.float16:
-                logger.warning(
-                    "For HPU, we cast models to bfloat16 instead of "
-                    "using float16 by default. Please specify `dtype` if you "
-                    "want to use float16.")
-                torch_dtype = torch.bfloat16
-        elif dtype == "float16" and config.model_type == "plamo2":
-            logger.warning(
-                "For PLaMo2, using float16 is unstable and might cause "
-                "unexpected behavior. Please use bfloat16 or float32 instead.")
-            torch_dtype = torch.float16
+    if is_pooling_model and torch.float16 in supported_dtypes:
+        preferred_dtype = torch.float16
+    else:
+        preferred_dtype = supported_dtypes[0]
+
+    # Downcast for float32 models
+    if config_dtype == torch.float32:
+        config_dtype = preferred_dtype
+
+    if config_dtype in supported_dtypes:
+        return config_dtype
+
+    # Ensure device compatibility
+    device_name = current_platform.get_device_name()
+    device_capability = current_platform.get_device_capability()
+
+    if device_capability is None:
+        device_str = f"{device_name!r}"
+    else:
+        version_str = device_capability.as_version_str()
+        device_str = f"{device_name!r} (with compute capability {version_str})"
+
+    logger.warning(
+        "Your device %s doesn't support %s. "
+        "Falling back to %s for compatibility.",
+        device_str,
+        config_dtype,
+        preferred_dtype,
+    )
+
+    return preferred_dtype
+
+
+def _get_and_verify_dtype(
+    model_id: str,
+    config: PretrainedConfig,
+    dtype: Union[str, torch.dtype],
+    *,
+    is_pooling_model: bool,
+    revision: Optional[str] = None,
+) -> torch.dtype:
+    config_dtype = _find_dtype(model_id, config, revision=revision)
+    model_type = config.model_type
+
+    if isinstance(dtype, str):
+        dtype = dtype.lower()
+        if dtype == "auto":
+            # Set default dtype from model config
+            torch_dtype = _resolve_auto_dtype(
+                model_type,
+                config_dtype,
+                is_pooling_model=is_pooling_model,
+            )
         else:
             if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
-                raise ValueError(f"Unknown dtype: {dtype}")
+                raise ValueError(f"Unknown dtype: {dtype!r}")
             torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
     elif isinstance(dtype, torch.dtype):
         torch_dtype = dtype
     else:
         raise ValueError(f"Unknown dtype: {dtype}")
 
-    # Verify the dtype.
+    _check_valid_dtype(model_type, torch_dtype)
+
     if torch_dtype != config_dtype:
         if torch_dtype == torch.float32:
             # Upcasting to float32 is allowed.
             logger.info("Upcasting %s to %s.", config_dtype, torch_dtype)
-            pass
         elif config_dtype == torch.float32:
             # Downcasting from float32 to float16 or bfloat16 is allowed.
             logger.info("Downcasting %s to %s.", config_dtype, torch_dtype)
-            pass
         else:
             # Casting between float16 and bfloat16 is allowed with a warning.
             logger.warning("Casting %s to %s.", config_dtype, torch_dtype)
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index c79c603c0..eaffaac78 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -28,7 +28,7 @@ class CpuPlatform(Platform):
     dispatch_key: str = "CPU"
 
     @property
-    def supported_dtypes(self) -> list:
+    def supported_dtypes(self) -> list[torch.dtype]:
         if self.get_cpu_architecture() == CpuArchEnum.POWERPC:
             return [torch.bfloat16, torch.float32]
         elif sys.platform.startswith(
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 2ed71a4d3..8774f95a2 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -4,12 +4,12 @@ import enum
 import json
 import os
 import time
-from functools import cache
+from functools import cache, partial
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, Union
+from typing import Any, Callable, Literal, Optional, TypeVar, Union
 
 import huggingface_hub
-from huggingface_hub import hf_hub_download
+from huggingface_hub import get_safetensors_metadata, hf_hub_download
 from huggingface_hub import list_repo_files as hf_list_repo_files
 from huggingface_hub import try_to_load_from_cache
 from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
@@ -93,10 +93,15 @@ class ConfigFormat(str, enum.Enum):
     MISTRAL = "mistral"
 
 
-def with_retry(func: Callable[[], Any],
-               log_msg: str,
-               max_retries: int = 2,
-               retry_delay: int = 2):
+_R = TypeVar("_R")
+
+
+def with_retry(
+    func: Callable[[], _R],
+    log_msg: str,
+    max_retries: int = 2,
+    retry_delay: int = 2,
+) -> _R:
     for attempt in range(max_retries):
         try:
             return func()
@@ -109,6 +114,8 @@ def with_retry(func: Callable[[], Any],
             time.sleep(retry_delay)
             retry_delay *= 2
 
+    raise AssertionError("Should not be reached")
+
 
 # @cache doesn't cache exceptions
 @cache
@@ -840,3 +847,22 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
         return resolve_obj_by_qualname(function_name)()
     else:
         return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
+
+
+def try_get_safetensors_metadata(
+    model: str,
+    *,
+    revision: Optional[str] = None,
+):
+    get_safetensors_metadata_partial = partial(
+        get_safetensors_metadata,
+        model,
+        revision=revision,
+        token=os.getenv('HF_TOKEN', None),
+    )
+
+    try:
+        return with_retry(get_safetensors_metadata_partial,
+                          "Error retrieving safetensors")
+    except Exception:
+        return None
diff --git a/vllm/utils.py b/vllm/utils.py
index 65d3579d5..c879b38d0 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -37,8 +37,8 @@ from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser,
                       _ArgumentGroup)
 from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import UserDict, defaultdict
-from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable,
-                             Iterable, Iterator, KeysView, Mapping)
+from collections.abc import (AsyncGenerator, Awaitable, Collection, Generator,
+                             Hashable, Iterable, Iterator, KeysView, Mapping)
 from concurrent.futures.process import ProcessPoolExecutor
 from dataclasses import dataclass, field
 from functools import cache, lru_cache, partial, wraps
@@ -979,6 +979,53 @@ def get_dtype_size(dtype: torch.dtype) -> int:
     return torch.tensor([], dtype=dtype).element_size()
 
 
+# bool = 0, int = 1, float = 2, complex = 3
+def _get_precision_level(dtype: torch.dtype) -> int:
+    # NOTE: Complex dtypes return `is_floating_point=False`
+    return ((dtype != torch.bool) + dtype.is_floating_point +
+            dtype.is_complex * 2)
+
+
+def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype):
+    """
+    Test whether it is lossless to cast a tensor from
+    `src_dtype` to `tgt_dtype`.
+    """
+    if src_dtype == tgt_dtype:
+        return True
+
+    src_level = _get_precision_level(src_dtype)
+    tgt_level = _get_precision_level(tgt_dtype)
+
+    if src_level < tgt_level:
+        return True
+    if src_level > tgt_level:
+        return False
+
+    # Compare integral types
+    if not src_dtype.is_floating_point and not src_dtype.is_complex:
+        src_info = torch.iinfo(src_dtype)
+        tgt_info = torch.iinfo(tgt_dtype)
+        return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
+
+    # Compare floating-point types
+    src_info = torch.finfo(src_dtype)
+    tgt_info = torch.finfo(tgt_dtype)
+    return (src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
+            and src_info.resolution >= tgt_info.resolution)
+
+
+def common_broadcastable_dtype(dtypes: Collection[torch.dtype]):
+    """
+    Get the common `dtype` where all of the other `dtypes` can be
+    cast to it without losing any information.
+    """
+    return max(
+        dtypes,
+        key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes),
+    )
+
+
 # `collections` helpers
 def is_list_of(
     value: object,
-- 
GitLab


From a35ca765a52fff242edf0e9fd3203ea2534aed58 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 1 Jun 2025 11:06:57 +0800
Subject: [PATCH 103/274] [LoRA] Support dynamically initialize
 `packed_modules_mapping` for VLM with arbitrary components (#18987)

Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/lora/models.py                           |  6 +++---
 .../model_loader/bitsandbytes_loader.py       |  8 +++----
 vllm/model_executor/models/intern_vit.py      |  4 ++++
 vllm/model_executor/models/internvl.py        |  9 --------
 vllm/model_executor/models/qwen2_5_vl.py      | 11 ----------
 vllm/model_executor/models/qwen2_vl.py        | 11 ----------
 vllm/model_executor/utils.py                  | 21 +++++++++++++++++++
 7 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index d3b1374a9..dfdc908d7 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import copy
 import math
 import os
 from collections.abc import Sequence
@@ -34,6 +33,7 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
+from vllm.model_executor.utils import get_packed_modules_mapping
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
@@ -364,8 +364,8 @@ class LoRAModelManager(AdapterModelManager):
             # We need to replace rotary emb layer to do batch computation
             # for long lora.
             self.supported_lora_modules.append("rotary_emb")
-        self.packed_modules_mapping = copy.deepcopy(
-            self.model.packed_modules_mapping)
+
+        self.packed_modules_mapping = get_packed_modules_mapping(self.model)
         # Used to indicate whether the model is a multimodal model
         self.supports_mm: bool = (
             supports_multimodal(self.model)
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 0d83c8d53..8996ea266 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # ruff: noqa: SIM117
-import copy
 import fnmatch
 import glob
 import itertools
@@ -36,7 +35,8 @@ from vllm.model_executor.model_loader.weight_utils import (
     filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models import is_pooling_model
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.utils import (get_packed_modules_mapping,
+                                       set_weight_attrs)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -420,8 +420,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 f"Model {type(model).__name__} does not support BitsAndBytes "
                 "quantization yet. No 'packed_modules_mapping' found.")
         self.is_pool_model=is_pooling_model(model)
-        self.modules_mapping = ParamMapping(
-            copy.deepcopy(model.packed_modules_mapping))
+
+        self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))
 
         # For some models like Molmo, we need to use hf_to_vllm_mapper
         # to ensure correct loading of weights.
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index d9d9002bd..538e9de4f 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -415,6 +415,10 @@ class InternVisionEncoder(nn.Module):
 
 class InternVisionModel(nn.Module):
 
+    packed_modules_mapping = {
+        "qkv": ["qkv"],
+    }
+
     def __init__(
         self,
         config: PretrainedConfig,
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 71be2b48d..c37d3afb4 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1019,15 +1019,6 @@ class InternVLMultiModalProcessor(
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
                         SupportsLoRA):
 
-    packed_modules_mapping = {
-        "wqkv": ["wqkv"],
-        "qkv": ["qkv"],
-        "gate_up_proj": [
-            "w1",
-            "w3",
-        ],
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 68dd07820..e3fa9f67c 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -821,17 +821,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
     dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
 class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
 
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 0ff0836b0..873baa56f 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1069,17 +1069,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
                                         dummy_inputs=Qwen2VLDummyInputsBuilder)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                       SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
 
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index f9d89e64b..1b120c354 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """Utils for model executor."""
+import copy
 from typing import Any, Optional
 
 import torch
@@ -51,3 +52,23 @@ def _make_synced_weight_loader(original_weight_loader):
         torch._sync(param)
 
     return _synced_weight_loader
+
+
+def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
+    parent_map = copy.deepcopy(getattr(model, "packed_modules_mapping", {}))
+
+    # don't infer mapping if the model has defined it explicitly.
+    if parent_map:
+        return parent_map
+
+    # We only check main components instead of whole model submodules
+    for child in model.children():
+        child_map = getattr(child, "packed_modules_mapping", {})
+        if any((k in parent_map and parent_map[k] != v)
+               for k, v in child_map.items()):
+            raise ValueError(
+                f"Can't update {type(model).__name__}'s packed_modules_mapping "
+                f"safely because of conflicts from {type(child).__name__}.")
+        else:
+            parent_map.update(child_map)
+    return parent_map
\ No newline at end of file
-- 
GitLab


From c594cbf56546f74e8042018477ca25d1ee95613a Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Sun, 1 Jun 2025 11:23:43 +0800
Subject: [PATCH 104/274] [doc] small fix -  mkdocs (#18996)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/contributing/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 72c4909b1..6216a3d16 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -43,11 +43,12 @@ Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blo
 pip install -r requirements/docs.txt
 ```
 
-> **Note:** Ensure that your Python version is compatible with the plugins (e.g., mkdocs-awesome-nav requires Python 3.10+)
+!!! note
+    Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
 
 #### Verify Installation
 
-Confirm that MkDocs is correctly installed::
+Confirm that MkDocs is correctly installed:
 
 ```bash
 mkdocs --version
-- 
GitLab


From 2ad6194a0250932da7354e4d9dbebaf610f90202 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Sat, 31 May 2025 23:41:29 -0400
Subject: [PATCH 105/274] Let max_num_batched_tokens use human_readable_int for
 large numbers (#18968)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 13d8a280e..555532526 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -224,7 +224,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
         elif contains_type(type_hints, int):
             kwargs[name]["type"] = int
             # Special case for large integers
-            if name in {"max_model_len"}:
+            if name in {"max_model_len", "max_num_batched_tokens"}:
                 kwargs[name]["type"] = human_readable_int
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
-- 
GitLab


From aa54a7bf7bae7e1db43693470ebe93e3dcd30f9d Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Sun, 1 Jun 2025 11:42:10 +0800
Subject: [PATCH 106/274] [BugFix] fix data parallel construct ipv6 url addres
 (#18991)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
---
 vllm/distributed/parallel_state.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index b674d05a7..6e48c02da 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -41,8 +41,8 @@ from vllm.distributed.device_communicators.base_device_communicator import (
     DeviceCommunicatorBase)
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
-from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname,
-                        supports_custom_op)
+from vllm.utils import (direct_register_custom_op, get_distributed_init_method,
+                        resolve_obj_by_qualname, supports_custom_op)
 
 
 @dataclass
@@ -929,7 +929,7 @@ def init_distributed_environment(
         world_size = parallel_config.world_size_across_dp
         ip = parallel_config.data_parallel_master_ip
         port = parallel_config.get_next_dp_init_port()
-        distributed_init_method = f"tcp://{ip}:{port}"  # noqa
+        distributed_init_method = get_distributed_init_method(ip, port)
         logger.info(
             "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
             world_size, rank, distributed_init_method)
-- 
GitLab


From 2b102d51ad357903c502d72c1264fb2f724353f3 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Sat, 31 May 2025 20:42:23 -0700
Subject: [PATCH 107/274] [BugFix] Fix incorrect metrics shutdown error log
 message (#18992)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 vllm/v1/metrics/prometheus.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py
index f12568535..a364b286d 100644
--- a/vllm/v1/metrics/prometheus.py
+++ b/vllm/v1/metrics/prometheus.py
@@ -69,9 +69,13 @@ def unregister_vllm_metrics():
 
 def shutdown_prometheus():
     """Shutdown prometheus metrics."""
+
+    path = _prometheus_multiproc_dir
+    if path is None:
+        return
     try:
         pid = os.getpid()
-        multiprocess.mark_process_dead(pid)
+        multiprocess.mark_process_dead(pid, path)
         logger.debug("Marked Prometheus metrics for process %d as dead", pid)
     except Exception as e:
         logger.error("Error during metrics cleanup: %s", str(e))
-- 
GitLab


From 432ec9926ebd3ce826f0d49df0a2a5ae3cc81ec0 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Sun, 1 Jun 2025 19:26:14 +0800
Subject: [PATCH 108/274] [doc] wrong output (#19000)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/contributing/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 6216a3d16..f84fb242f 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -57,7 +57,7 @@ mkdocs --version
 Example output:
 
 ```console
-mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.9/site-packages/mkdocs (Python 3.9)
+mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
 ```
 
 #### Clone the `vLLM` repository
-- 
GitLab


From d6fd3a33b863003929fc3eef5dd9828219e04ab0 Mon Sep 17 00:00:00 2001
From: zhrrr <43847754+izhuhaoran@users.noreply.github.com>
Date: Mon, 2 Jun 2025 03:41:18 +0800
Subject: [PATCH 109/274] [Misc] reuse num_tokens_across_dp of get_dp_padding
 to avoid unnecessary dp all reduce in set_forward_context (#18935)

Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
---
 vllm/forward_context.py            | 27 ++++++++++++++-------
 vllm/v1/worker/gpu_model_runner.py | 38 ++++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 0af16bbc0..f192be1c4 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -47,8 +47,12 @@ class DPMetadata:
         return num_tokens_tensor
 
     @staticmethod
-    def make(parallel_config: ParallelConfig, attn_metadata: Any,
-             num_tokens: int) -> "DPMetadata":
+    def make(
+            parallel_config: ParallelConfig,
+            attn_metadata: Any,
+            num_tokens: int,
+            num_tokens_across_dp: Optional[torch.Tensor] = None
+    ) -> "DPMetadata":
 
         assert parallel_config.data_parallel_size > 1
         dp_size = parallel_config.data_parallel_size
@@ -62,10 +66,15 @@ class DPMetadata:
             # for v1 attention backends or no attn_metadata
             batchsize = num_tokens
 
-        num_tokens_tensor = DPMetadata.num_tokens_across_dp(
-            batchsize, dp_size, dp_rank)
-        max_tokens_across_dp_cpu = torch.max(num_tokens_tensor)
-        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
+        # If num_tokens_across_dp is None, it will be computed by all_reduce
+        # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
+        assert (num_tokens_across_dp is None
+                or num_tokens_across_dp[dp_rank] == batchsize)
+        if num_tokens_across_dp is None:
+            num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
+                batchsize, dp_size, dp_rank)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp)
+        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0)
         return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu)
 
 
@@ -101,7 +110,8 @@ def get_forward_context() -> ForwardContext:
 def set_forward_context(attn_metadata: Any,
                         vllm_config: VllmConfig,
                         virtual_engine: int = 0,
-                        num_tokens: Optional[int] = None):
+                        num_tokens: Optional[int] = None,
+                        num_tokens_across_dp: Optional[torch.Tensor] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
@@ -114,7 +124,8 @@ def set_forward_context(attn_metadata: Any,
     if vllm_config.parallel_config.data_parallel_size > 1 and (
             attn_metadata is not None or num_tokens is not None):
         dp_metadata = DPMetadata.make(vllm_config.parallel_config,
-                                      attn_metadata, num_tokens or 0)
+                                      attn_metadata, num_tokens or 0,
+                                      num_tokens_across_dp)
 
     global _forward_context
     prev_context = _forward_context
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b6fa68ab0..4bc825ccb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1111,17 +1111,30 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             for k, v in self.intermediate_tensors.items()
         })
 
-    def get_dp_padding(self, num_tokens: int):
+    def get_dp_padding(self,
+                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         dp_rank = self.vllm_config.parallel_config.data_parallel_rank
-        if dp_size == 1:
+
+        # For DP: Don't pad when setting enforce_eager.
+        # This lets us set enforce_eager on the prefiller in a P/D setup and
+        # still use CUDA graphs (enabled by this padding) on the decoder.
+        #
+        # TODO(tms) : There are many cases where padding is enabled for
+        # prefills, causing unnecessary and excessive padding of activations.
+
+        if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
             # Early exit.
-            return 0
+            return 0, None
 
         num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
             num_tokens, dp_size, dp_rank)
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
-        return max_tokens_across_dp_cpu - num_tokens
+        num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
+                                                dp_size,
+                                                device="cpu",
+                                                dtype=torch.int32)
+        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
 
     @torch.inference_mode()
     def execute_model(
@@ -1161,7 +1174,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 num_input_tokens = num_scheduled_tokens
 
         # Padding for DP
-        num_input_tokens += self.get_dp_padding(num_input_tokens)
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
+        num_input_tokens += num_pad
 
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
@@ -1208,7 +1222,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
-                                 num_tokens=num_input_tokens):
+                                 num_tokens=num_input_tokens,
+                                 num_tokens_across_dp=num_tokens_across_dp):
             self.maybe_setup_kv_connector(scheduler_output)
 
             model_output = self.model(
@@ -1681,7 +1696,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
     ) -> torch.Tensor:
 
         # Padding for DP
-        num_tokens += self.get_dp_padding(num_tokens)
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
+        num_tokens += num_pad
 
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
@@ -1747,9 +1763,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                     num_tokens, None, False)
 
-            with set_forward_context(attn_metadata,
-                                     self.vllm_config,
-                                     num_tokens=num_tokens):
+            with set_forward_context(
+                    attn_metadata,
+                    self.vllm_config,
+                    num_tokens=num_tokens,
+                    num_tokens_across_dp=num_tokens_across_dp):
                 outputs = model(
                     input_ids=input_ids,
                     positions=positions,
-- 
GitLab


From b9f61e13875e1682d3982829006bec26981fde4d Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Sun, 1 Jun 2025 23:30:41 -0400
Subject: [PATCH 110/274] [Bugfix][Nixl] Fix DP Metadata Handshake (#19008)

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
---
 .../kv_connector/v1/nixl_connector.py         | 68 ++++++++++---------
 1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 6a3472157..4d228dbc9 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
-    get_tp_group, get_world_group)
+    get_tp_group)
 from vllm.logger import init_logger
 from vllm.utils import make_zmq_path, make_zmq_socket, round_down
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -172,6 +172,11 @@ class NixlConnectorScheduler:
         self.vllm_config = vllm_config
         self.block_size = vllm_config.cache_config.block_size
         self.engine_id = engine_id
+        self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
+        self.side_channel_port = (
+            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
+            vllm_config.parallel_config.data_parallel_rank_local *
+            vllm_config.parallel_config.tensor_parallel_size)
         logger.info("Initializing NIXL Scheduler %s", engine_id)
 
         # Requests that need to start recv.
@@ -310,8 +315,8 @@ class NixlConnectorScheduler:
             do_remote_decode=False,
             remote_block_ids=computed_block_ids,
             remote_engine_id=self.engine_id,
-            remote_host=envs.VLLM_NIXL_SIDE_CHANNEL_HOST,
-            remote_port=envs.VLLM_NIXL_SIDE_CHANNEL_PORT,
+            remote_host=self.side_channel_host,
+            remote_port=self.side_channel_port,
         )
 
 
@@ -330,11 +335,19 @@ class NixlConnectorWorker:
         # Map of engine_id -> agent_name.
         self._remote_agents: dict[str, str] = {}
 
+        # NIXL handshake port.
+        # NOTE(rob): Within a DP group, each DP rank gets its own
+        # base port (which is sent in the KVTransferParams).
+        # Each TP rank listens/queries on the base_port + tp_rank.
+        self.side_channel_port = (
+            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
+            vllm_config.parallel_config.data_parallel_rank_local *
+            vllm_config.parallel_config.tensor_parallel_size)
+
         # Metadata.
         self.engine_id = engine_id
-        self.rank = get_tensor_model_parallel_rank()
+        self.tp_rank = get_tensor_model_parallel_rank()
         self.world_size = get_tensor_model_parallel_world_size()
-        self.world_rank = get_world_group().rank_in_group
         self.tp_group = get_tp_group()
 
         # KV Caches and nixl tracking data.
@@ -383,16 +396,11 @@ class NixlConnectorWorker:
 
     @staticmethod
     def _nixl_handshake_listener(metadata: NixlAgentMetadata,
-                                 ready_event: threading.Event,
-                                 world_rank: int):
+                                 ready_event: threading.Event, base_port: int,
+                                 tp_rank: int):
         """Background thread for getting new NIXL handshakes."""
         # NOTE(rob): this is a simple implementation. We will move
-        # to a better approach like an ETCD server in the future.
-
-        # NOTE(rob): to support heterogeneous TP, we will have to
-        # move this into the scheduler rather than worker, since
-        # each rank needs the metadata of all other ranks (whereas
-        # in this setup, each rank only gets one other rank's meta.
+        # to a better approach via HTTP endpoint soon.
 
         encoder = msgspec.msgpack.Encoder()
         encoded_data = encoder.encode(metadata)
@@ -402,11 +410,7 @@ class NixlConnectorWorker:
 
         # Listen for new requests for metadata.
         host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
-        # NOTE(rob): we need each rank to have a unique port. This
-        # hack to keeps us moving. We will switch when moving to etcd
-        # or where we have a single ZMQ socket in the scheduler.
-        port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + world_rank
-        path = make_zmq_path("tcp", host, port)
+        path = make_zmq_path("tcp", host, base_port + tp_rank)
         logger.debug("Starting listening on path: %s", path)
         with zmq_ctx(zmq.ROUTER, path) as sock:
             ready_event.set()
@@ -421,10 +425,10 @@ class NixlConnectorWorker:
         """Do a NIXL handshake with a remote instance."""
 
         start_time = time.perf_counter()
-        # NOTE(rob): we need each rank to have a unique port. This is
-        # a hack to keep us moving. We will switch when moving to etcd
-        # or where we have a single ZMQ socket in the scheduler.
-        path = make_zmq_path("tcp", host, port + self.world_rank)
+        # NOTE(rob): we need each tp_rank to have a unique port.
+        # This is a hack to keep us moving. We will switch when
+        # we switch to HTTP-based NIXL metadata exchange.
+        path = make_zmq_path("tcp", host, port + self.tp_rank)
         logger.debug("Querying metadata on path: %s", path)
         with zmq_ctx(zmq.REQ, path) as sock:
             # Send query for the request.
@@ -532,7 +536,7 @@ class NixlConnectorWorker:
         ready_event = threading.Event()
         self._nixl_handshake_listener_t = threading.Thread(
             target=self._nixl_handshake_listener,
-            args=(metadata, ready_event, self.world_rank),
+            args=(metadata, ready_event, self.side_channel_port, self.tp_rank),
             daemon=True,
             name="nixl_handshake_listener")
         self._nixl_handshake_listener_t.start()
@@ -556,9 +560,9 @@ class NixlConnectorWorker:
                 block_offset = block_id * self.block_len
                 # (addr, len, device id)
                 blocks_data.append(
-                    (base_addr + block_offset, self.block_len, self.rank))
-        logger.debug("Created %s blocks for src engine %s and rank %s",
-                     len(blocks_data), self.engine_id, self.rank)
+                    (base_addr + block_offset, self.block_len, self.tp_rank))
+        logger.debug("Created %s blocks for src engine %s and tp_rank %s",
+                     len(blocks_data), self.engine_id, self.tp_rank)
 
         # Register with NIXL.
         descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
@@ -573,9 +577,9 @@ class NixlConnectorWorker:
                 block_offset = block_id * self.block_len
                 # (addr, len, device id)
                 blocks_data.append(
-                    (base_addr + block_offset, self.block_len, self.rank))
-        logger.debug("Created %s blocks for dst engine %s and rank %s",
-                     len(blocks_data), engine_id, self.rank)
+                    (base_addr + block_offset, self.block_len, self.tp_rank))
+        logger.debug("Created %s blocks for dst engine %s and tp_rank %s",
+                     len(blocks_data), engine_id, self.tp_rank)
 
         # Register with NIXL.
         descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
@@ -600,14 +604,14 @@ class NixlConnectorWorker:
         if len(done_sending) > 0 or len(done_recving) > 0:
             logger.debug(
                 "Rank %s, get_finished: %s requests done sending "
-                "and %s requests done recving", self.rank, len(done_sending),
-                len(done_recving))
+                "and %s requests done recving", self.tp_rank,
+                len(done_sending), len(done_recving))
 
         if self.world_size == 1:
             return done_sending, done_recving
 
         # Rank 0: get finished from all other ranks.
-        if self.rank == 0:
+        if self.tp_rank == 0:
             for req_id in done_sending:
                 self._done_sending_count[req_id] += 1
             for req_id in done_recving:
-- 
GitLab


From 9760fd8f6acc752168333a25f5da80a6d2097608 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Mon, 2 Jun 2025 02:38:50 -0700
Subject: [PATCH 111/274] [Core] Support inplace model weights loading (#18745)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 tests/tensorizer_loader/test_tensorizer.py    |  17 --
 tests/v1/worker/test_gpu_model_runner.py      |  18 ++
 .../model_loader/base_loader.py               |  22 ++-
 .../model_loader/bitsandbytes_loader.py       |  18 +-
 .../model_loader/default_loader.py            |  50 ++----
 .../model_loader/dummy_loader.py              |  23 +--
 .../model_loader/gguf_loader.py               |  10 +-
 .../model_loader/runai_streamer_loader.py     |  30 +---
 .../model_loader/sharded_state_loader.py      |  99 +++++------
 .../model_executor/model_loader/tensorizer.py | 165 +++++++-----------
 .../model_loader/tensorizer_loader.py         |  58 +++---
 vllm/v1/worker/gpu_model_runner.py            |  15 +-
 vllm/v1/worker/tpu_model_runner.py            |  21 ++-
 13 files changed, 249 insertions(+), 297 deletions(-)

diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index b6286e148..747ec56ad 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -4,7 +4,6 @@ import gc
 import os
 import pathlib
 import subprocess
-from unittest.mock import MagicMock, patch
 
 import pytest
 import torch
@@ -16,7 +15,6 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          TensorSerializer,
                                                          is_vllm_tensorized,
-                                                         load_with_tensorizer,
                                                          open_stream,
                                                          tensorize_vllm_model)
 # yapf: enable
@@ -61,21 +59,6 @@ def write_keyfile(keyfile_path: str):
         f.write(encryption_params.key)
 
 
-@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
-def test_load_with_tensorizer(mock_agent, tensorizer_config):
-    mock_linear_method = MagicMock()
-    mock_agent_instance = mock_agent.return_value
-    mock_agent_instance.deserialize.return_value = MagicMock()
-
-    result = load_with_tensorizer(tensorizer_config,
-                                  quant_method=mock_linear_method)
-
-    mock_agent.assert_called_once_with(tensorizer_config,
-                                       quant_method=mock_linear_method)
-    mock_agent_instance.deserialize.assert_called_once()
-    assert result == mock_agent_instance.deserialize.return_value
-
-
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_can_deserialize_s3(vllm_runner):
     model_ref = "EleutherAI/pythia-1.4b"
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index c38eb4866..6ba6d1f6f 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -94,6 +94,9 @@ def model_runner():
     return runner
 
 
+model_runner_2 = model_runner
+
+
 def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
     new_reqs = []
     num_scheduled_tokens = {}
@@ -366,3 +369,18 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
         assert all(kv.is_contiguous() for kv in model_runner.kv_caches)
     else:
         assert all(not kv.is_contiguous() for kv in model_runner.kv_caches)
+
+
+def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
+    # In this test, model_runner loads model + weights in one go, while
+    # model_runner_2 loads dummy weights first then load real weights inplace
+    model_runner.load_model()
+    original_load_format = model_runner_2.load_config.load_format
+    model_runner_2.load_config.load_format = "dummy"
+    model_runner_2.load_model()  # Initial model loading with dummy weights
+    assert str(model_runner.get_model().state_dict()) != str(
+        model_runner_2.get_model().state_dict())
+    model_runner_2.load_config.load_format = original_load_format
+    model_runner_2.load_model()  # Load real weights inplace
+    assert str(model_runner.get_model().state_dict()) == str(
+        model_runner_2.get_model().state_dict())
diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index 010dd5157..d619d9f25 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 from abc import ABC, abstractmethod
 
+import torch
 import torch.nn as nn
 
 from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.model_executor.model_loader.utils import (
+    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 
 
 class BaseModelLoader(ABC):
@@ -18,7 +21,22 @@ class BaseModelLoader(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def load_model(self, *, vllm_config: VllmConfig,
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        """Load weights into a model. This standalone API allows 
+        inplace weights loading for an already-initialized model"""
+        raise NotImplementedError
+
+    def load_model(self, vllm_config: VllmConfig,
                    model_config: ModelConfig) -> nn.Module:
         """Load a model with the given configurations."""
-        raise NotImplementedError
+        device_config = vllm_config.device_config
+        target_device = torch.device(device_config.device)
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = initialize_model(vllm_config=vllm_config,
+                                         model_config=model_config)
+            # Quantization does not happen in `load_weights` but after it
+            self.load_weights(model, model_config)
+            process_weights_after_loading(model, model_config, target_device)
+        return model.eval()
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 8996ea266..3df835a93 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -14,7 +14,7 @@ from huggingface_hub import HfApi
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 # yapf: enable
@@ -28,7 +28,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
                                                RowParallelLinear)
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.utils import (ParamMapping,
-                                                    initialize_model,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
@@ -408,8 +407,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 ), "vllm currently does not support BNB quantization for"
         f" {type(model).__name__}"
 
-    def _load_weights(self, model_config: ModelConfig,
-                      model: nn.Module) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         if not hasattr(model, "load_weights"):
             raise AttributeError(
                 "The required method 'load_weights' is not defined in class"
@@ -568,15 +566,3 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
-
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
-
-                model = initialize_model(vllm_config=vllm_config)
-
-                self._load_weights(model_config, model)
-
-        return model.eval()
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 29a6e0af4..6946627a5 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -12,11 +12,9 @@ from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
 from vllm import envs
-from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, LoadFormat, ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     fastsafetensors_weights_iterator, filter_duplicate_safetensors_files,
@@ -264,32 +262,20 @@ class DefaultModelLoader(BaseModelLoader):
                               fall_back_to_pt=True,
                               allow_patterns_overrides=None)
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config,
-                                         model_config=model_config)
-
-            weights_to_load = {name for name, _ in model.named_parameters()}
-            loaded_weights = model.load_weights(
-                self.get_all_weights(model_config, model))
-            self.counter_after_loading_weights = time.perf_counter()
-            logger.info(
-                "Loading weights took %.2f seconds",
-                self.counter_after_loading_weights -
-                self.counter_before_loading_weights)
-            # We only enable strict check for non-quantized models
-            # that have loaded weights tracking currently.
-            if model_config.quantization is None and loaded_weights is not None:
-                weights_not_loaded = weights_to_load - loaded_weights
-                if weights_not_loaded:
-                    raise ValueError(
-                        "Following weights were not initialized from "
-                        f"checkpoint: {weights_not_loaded}")
-
-            process_weights_after_loading(model, model_config, target_device)
-
-        return model.eval()
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        weights_to_load = {name for name, _ in model.named_parameters()}
+        loaded_weights = model.load_weights(
+            self.get_all_weights(model_config, model))
+        self.counter_after_loading_weights = time.perf_counter()
+        logger.info(
+            "Loading weights took %.2f seconds",
+            self.counter_after_loading_weights -
+            self.counter_before_loading_weights)
+        # We only enable strict check for non-quantized models
+        # that have loaded weights tracking currently.
+        if model_config.quantization is None and loaded_weights is not None:
+            weights_not_loaded = weights_to_load - loaded_weights
+            if weights_not_loaded:
+                raise ValueError("Following weights were not initialized from "
+                                 f"checkpoint: {weights_not_loaded}")
diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py
index 0e2f0be1e..64fa2be76 100644
--- a/vllm/model_executor/model_loader/dummy_loader.py
+++ b/vllm/model_executor/model_loader/dummy_loader.py
@@ -1,11 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
-import torch
 import torch.nn as nn
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     initialize_dummy_weights)
 
@@ -22,16 +19,8 @@ class DummyModelLoader(BaseModelLoader):
     def download_model(self, model_config: ModelConfig) -> None:
         pass  # Nothing to download
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config)
-            # NOTE(woosuk): For accurate performance evaluation, we assign
-            # random values to the weights.
-            initialize_dummy_weights(model)
-
-            process_weights_after_loading(model, model_config, target_device)
-        return model.eval()
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        # NOTE(woosuk): For accurate performance evaluation, we assign
+        # random values to the weights.
+        initialize_dummy_weights(model)
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index 806004bf9..1eac50422 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -92,6 +92,13 @@ class GGUFModelLoader(BaseModelLoader):
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model)
 
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        local_model_path = self._prepare_weights(model_config.model)
+        gguf_weights_map = self._get_gguf_weights_map(model_config)
+        model.load_weights(
+            self._get_weights_iterator(local_model_path, gguf_weights_map))
+
     def load_model(self, vllm_config: VllmConfig,
                    model_config: ModelConfig) -> nn.Module:
         device_config = vllm_config.device_config
@@ -106,8 +113,7 @@ class GGUFModelLoader(BaseModelLoader):
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
                 model = initialize_model(vllm_config=vllm_config)
-            model.load_weights(
-                self._get_weights_iterator(local_model_path, gguf_weights_map))
+            self.load_weights(model, model_config)
 
             process_weights_after_loading(model, model_config, target_device)
         return model
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index 9f1022c25..a39e26c6d 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -9,10 +9,8 @@ import torch
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     runai_safetensors_weights_iterator)
@@ -100,21 +98,11 @@ class RunaiModelStreamerLoader(BaseModelLoader):
         """Download model if necessary"""
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        """Perform streaming of the model to destination"""
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config)
-
-            model_weights = model_config.model
-            if hasattr(model_config, "model_weights"):
-                model_weights = model_config.model_weights
-            model.load_weights(
-                self._get_weights_iterator(model_weights,
-                                           model_config.revision))
-
-            process_weights_after_loading(model, model_config, target_device)
-        return model.eval()
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        """Load weights into a model."""
+        model_weights = model_config.model
+        if hasattr(model_config, "model_weights"):
+            model_weights = model_config.model_weights
+        model.load_weights(
+            self._get_weights_iterator(model_weights, model_config.revision))
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index 78bca89f0..b5a5031bb 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -9,11 +9,9 @@ from typing import Any, Optional
 import torch
 from torch import nn
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_weights_from_hf, runai_safetensors_weights_iterator)
 from vllm.transformers_utils.s3_utils import glob as s3_glob
@@ -100,11 +98,8 @@ class ShardedStateLoader(BaseModelLoader):
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
         from vllm.distributed import get_tensor_model_parallel_rank
 
         model_weights = model_config.model
@@ -112,53 +107,47 @@ class ShardedStateLoader(BaseModelLoader):
             model_weights = model_config.model_weights
         local_model_path = model_weights
 
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config)
-                process_weights_after_loading(model, model_config,
-                                              target_device)
-            rank = get_tensor_model_parallel_rank()
-            pattern = os.path.join(
-                local_model_path,
-                self.pattern.format(rank=rank, part="*"),
-            )
-
-            filepaths = []
-            if is_s3(local_model_path):
-                file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}"
-                filepaths = s3_glob(path=local_model_path,
-                                    allow_pattern=[file_pattern])
-            else:
-                filepaths = glob.glob(pattern)
-            if not filepaths:
-                # TODO: support un-sharded checkpoints too
-                raise ValueError(
-                    f"Could not find checkpoint files '{pattern}', only "
-                    f"pre-sharded checkpoints are currently supported!")
-            state_dict = self._filter_subtensors(model.state_dict())
-            for key, tensor in self.iterate_over_files(filepaths):
-                # If loading with LoRA enabled, additional padding may
-                # be added to certain parameters. We only load into a
-                # narrowed view of the parameter data.
-                param_data = state_dict[key].data
-                param_shape = state_dict[key].shape
-                for dim, size in enumerate(tensor.shape):
-                    if size < param_shape[dim]:
-                        param_data = param_data.narrow(dim, 0, size)
-                if tensor.shape != param_shape:
-                    logger.warning(
-                        "loading tensor of shape %s into "
-                        "parameter '%s' of shape %s",
-                        tensor.shape,
-                        key,
-                        param_shape,
-                    )
-                param_data.copy_(tensor)
-                state_dict.pop(key)
-            if state_dict:
-                raise ValueError(
-                    f"Missing keys {tuple(state_dict)} in loaded state!")
-        return model.eval()
+        rank = get_tensor_model_parallel_rank()
+        pattern = os.path.join(
+            local_model_path,
+            self.pattern.format(rank=rank, part="*"),
+        )
+
+        filepaths = []
+        if is_s3(local_model_path):
+            file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}"
+            filepaths = s3_glob(path=local_model_path,
+                                allow_pattern=[file_pattern])
+        else:
+            filepaths = glob.glob(pattern)
+        if not filepaths:
+            # TODO: support un-sharded checkpoints too
+            raise ValueError(
+                f"Could not find checkpoint files '{pattern}', only "
+                f"pre-sharded checkpoints are currently supported!")
+        state_dict = self._filter_subtensors(model.state_dict())
+        for key, tensor in self.iterate_over_files(filepaths):
+            # If loading with LoRA enabled, additional padding may
+            # be added to certain parameters. We only load into a
+            # narrowed view of the parameter data.
+            param_data = state_dict[key].data
+            param_shape = state_dict[key].shape
+            for dim, size in enumerate(tensor.shape):
+                if size < param_shape[dim]:
+                    param_data = param_data.narrow(dim, 0, size)
+            if tensor.shape != param_shape:
+                logger.warning(
+                    "loading tensor of shape %s into "
+                    "parameter '%s' of shape %s",
+                    tensor.shape,
+                    key,
+                    param_shape,
+                )
+            param_data.copy_(tensor)
+            state_dict.pop(key)
+        if state_dict:
+            raise ValueError(
+                f"Missing keys {tuple(state_dict)} in loaded state!")
 
     def iterate_over_files(
             self, paths) -> Generator[tuple[str, torch.Tensor], None, None]:
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 4c4502284..90c0bdf08 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -21,7 +21,8 @@ from torch.utils._python_dispatch import TorchDispatchMode
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
-from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config
+from vllm.config import (ModelConfig, ParallelConfig, VllmConfig,
+                         set_current_vllm_config)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -208,12 +209,6 @@ class TensorizerConfig:
                            **tensorizer_args.stream_params)
 
 
-def load_with_tensorizer(tensorizer_config: TensorizerConfig,
-                         **extra_kwargs) -> nn.Module:
-    tensorizer = TensorizerAgent(tensorizer_config, **extra_kwargs)
-    return tensorizer.deserialize()
-
-
 @dataclass
 class TensorizerArgs:
     tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str,
@@ -366,100 +361,72 @@ class TensorizerArgs:
         return tensorizer_args
 
 
-class TensorizerAgent:
-    """
-    A class for performing tensorizer deserializations specifically for
-    vLLM models using plaid_mode. Uses TensorizerArgs to configure the
-    behavior of the TensorDeserializer when loading tensors from a serialized
-    model. For deserializations of HuggingFace models, TensorDeserializer is
-    instead used as an iterator directly in the func hf_model_weights_iterator
-    in vllm/model_executor/model_loader/weight_utils.py
-    """
-
-    def __init__(self, tensorizer_config: TensorizerConfig, vllm_config):
-        self.tensorizer_config = tensorizer_config
-        self.tensorizer_args = (
-            self.tensorizer_config._construct_tensorizer_args())
-        self.vllm_config = vllm_config
-        self.model = self._init_model()
-
-    def _init_model(self):
-        assert self.tensorizer_config.hf_config is not None
-        model_args = self.tensorizer_config.hf_config
-        model_args.torch_dtype = self.tensorizer_config.dtype
-        assert self.tensorizer_config.model_class is not None
-        # TODO: Do we need to consider old-style model class?
-        with meta_tensor_mode(), set_current_vllm_config(self.vllm_config,
-                                                         check_compile=True):
-            return self.tensorizer_config.model_class(
-                vllm_config=self.vllm_config)
-
-    def _resize_lora_embeddings(self):
-        """Modify LoRA embedding layers to use bigger tensors
-        to allow for adapter added tokens."""
-        for child in self.model.modules():
-            if (isinstance(child, VocabParallelEmbedding)
-                    and child.weight.shape[0]
-                    < child.num_embeddings_per_partition):
-                new_weight = torch.empty(child.num_embeddings_per_partition,
-                                         child.embedding_dim,
-                                         dtype=child.weight.dtype,
-                                         device=child.weight.device)
-                new_weight[:child.weight.shape[0]].copy_(child.weight.data)
-                new_weight[child.weight.shape[0]:].fill_(0)
-                child.weight.data = new_weight
-
-    def _check_tensors_on_meta_device(self):
-        for tensor in self.model.state_dict().values():
-            if tensor.device.type == 'meta':
-                raise ValueError(
-                    "The serialized model contains tensors on the meta device,"
-                    " indicating that some tensors were not loaded properly."
-                    " Please check that the parameters of the model being"
-                    " specified match that of the serialized model, such as"
-                    " its quantization.")
-
-    def deserialize(self):
-        """
-        Deserialize the model using the TensorDeserializer. This method is
-        specifically for vLLM models using tensorizer's plaid_mode.
-
-        The deserializer makes use of tensorizer_args.stream_params
-        to configure the behavior of the stream when loading tensors from a
-        serialized model. The deserializer_params are used to configure the
-        behavior of the TensorDeserializer when loading tensors themselves.
-        Documentation on these params can be found in TensorizerArgs
-
-        Returns:
-            nn.Module: The deserialized model.
-        """
-        before_mem = get_mem_usage()
-        start = time.perf_counter()
-        with _read_stream(
-                self.tensorizer_config.tensorizer_uri,
-                **self.tensorizer_args.stream_params
-        ) as stream, TensorDeserializer(
+def _check_tensors_on_meta_device(model: nn.Module) -> None:
+    for tensor in model.state_dict().values():
+        if tensor.device.type == 'meta':
+            raise ValueError(
+                "The serialized model contains tensors on the meta device,"
+                " indicating that some tensors were not loaded properly."
+                " Please check that the parameters of the model being"
+                " specified match that of the serialized model, such as"
+                " its quantization.")
+
+
+def _resize_lora_embeddings(model: nn.Module):
+    """Modify LoRA embedding layers to use bigger tensors
+    to allow for adapter added tokens."""
+    for child in model.modules():
+        if (isinstance(child, VocabParallelEmbedding) and child.weight.shape[0]
+                < child.num_embeddings_per_partition):
+            new_weight = torch.empty(child.num_embeddings_per_partition,
+                                     child.embedding_dim,
+                                     dtype=child.weight.dtype,
+                                     device=child.weight.device)
+            new_weight[:child.weight.shape[0]].copy_(child.weight.data)
+            new_weight[child.weight.shape[0]:].fill_(0)
+            child.weight.data = new_weight
+
+
+def init_tensorizer_model(tensorizer_config: TensorizerConfig,
+                          vllm_config: VllmConfig) -> nn.Module:
+    assert tensorizer_config.hf_config is not None
+    model_args = tensorizer_config.hf_config
+    model_args.torch_dtype = tensorizer_config.dtype
+    assert tensorizer_config.model_class is not None
+    # TODO: Do we need to consider old-style model class?
+    with meta_tensor_mode(), set_current_vllm_config(vllm_config,
+                                                     check_compile=True):
+        return tensorizer_config.model_class(vllm_config=vllm_config)
+
+
+def deserialize_tensorizer_model(model: nn.Module,
+                                 tensorizer_config: TensorizerConfig) -> None:
+    tensorizer_args = tensorizer_config._construct_tensorizer_args()
+    before_mem = get_mem_usage()
+    start = time.perf_counter()
+    with _read_stream(
+            tensorizer_config.tensorizer_uri,
+            **tensorizer_args.stream_params) as stream, TensorDeserializer(
                 stream,
-                dtype=self.tensorizer_config.dtype,
+                dtype=tensorizer_config.dtype,
                 device=f'cuda:{torch.cuda.current_device()}',
-                **self.tensorizer_args.deserializer_params) as deserializer:
-            deserializer.load_into_module(self.model)
-            end = time.perf_counter()
-
-        total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
-        duration = end - start
-        per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
-        after_mem = get_mem_usage()
-        deserializer.close()
-        logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
-                    end - start, per_second)
-        logger.info("Memory usage before: %s", before_mem)
-        logger.info("Memory usage after: %s", after_mem)
-
-        self._check_tensors_on_meta_device()
-        self._resize_lora_embeddings()
-        del self.model.vllm_tensorized_marker
-        return self.model.eval()
+                **tensorizer_args.deserializer_params) as deserializer:
+        deserializer.load_into_module(model)
+        end = time.perf_counter()
+
+    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+    duration = end - start
+    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+    after_mem = get_mem_usage()
+    deserializer.close()
+    logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
+                end - start, per_second)
+    logger.info("Memory usage before: %s", before_mem)
+    logger.info("Memory usage after: %s", after_mem)
+
+    _check_tensors_on_meta_device(model)
+    _resize_lora_embeddings(model)
+    del model.vllm_tensorized_marker
 
 
 def tensorizer_weights_iterator(
diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py
index 2afe2b59e..1923e040a 100644
--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -11,8 +11,8 @@ from vllm.config import LoadConfig, ModelConfig, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.tensorizer import (
-    TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
-    serialize_vllm_model, tensorizer_weights_iterator)
+    TensorizerConfig, deserialize_tensorizer_model, init_tensorizer_model,
+    is_vllm_tensorized, serialize_vllm_model, tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     initialize_model,
                                                     set_default_torch_dtype)
@@ -61,38 +61,34 @@ class TensorizerLoader(BaseModelLoader):
             model.load_weights(self._get_weights_iterator())
         return model.eval()
 
-    def _load_model_serialized(
-        self,
-        vllm_config: VllmConfig,
-    ) -> nn.Module:
-        """Load a serialized model with tensorizer.
-
-        Expects a vLLM-tensorized model. See the
-        examples/others/tensorize_vllm_model.py example script
-        for serializing vLLM models."""
-
-        device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
-
-        with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
-                model_class = get_model_architecture(model_config)[0]
-
-                tensorizer_config = copy.copy(self.tensorizer_config)
-                tensorizer_config.model_class = model_class
-                tensorizer_config.hf_config = model_config.hf_config
-                tensorizer_config.dtype = model_config.dtype
-
-                model = load_with_tensorizer(tensorizer_config,
-                                             vllm_config=vllm_config)
-        return model.eval()
-
     def download_model(self, model_config: ModelConfig) -> None:
         self.tensorizer_config.verify_with_model_config(model_config)
 
         with self.tensorizer_config.open_stream():
             pass
 
+    def _patch_tensorizer_config(
+            self, model_config: ModelConfig) -> TensorizerConfig:
+        model_class = get_model_architecture(model_config)[0]
+        tensorizer_config = copy.copy(self.tensorizer_config)
+        tensorizer_config.model_class = model_class
+        tensorizer_config.hf_config = model_config.hf_config
+        tensorizer_config.dtype = model_config.dtype
+        return tensorizer_config
+
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        """Load serialized model weights with tensorizer.
+
+        Expects a vLLM-tensorized model. See the
+        examples/others/tensorize_vllm_model.py example script
+        for serializing vLLM models."""
+        if is_vllm_tensorized(self.tensorizer_config):
+            tensorizer_config = self._patch_tensorizer_config(model_config)
+            deserialize_tensorizer_model(model, tensorizer_config)
+        else:
+            model.load_weights(self._get_weights_iterator())
+
     def load_model(self, vllm_config: VllmConfig,
                    model_config: ModelConfig) -> nn.Module:
         parallel_config = vllm_config.parallel_config
@@ -106,7 +102,11 @@ class TensorizerLoader(BaseModelLoader):
                 get_tensor_model_parallel_rank())
 
         if is_vllm_tensorized(self.tensorizer_config):
-            return self._load_model_serialized(vllm_config=vllm_config)
+            tensorizer_config = self._patch_tensorizer_config(model_config)
+            model = init_tensorizer_model(tensorizer_config=tensorizer_config,
+                                          vllm_config=vllm_config)
+            self.load_weights(model, model_config)
+            return model
         return self._load_model_serialized_cpu(vllm_config=vllm_config)
 
     @staticmethod
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4bc825ccb..9f7c474c7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -28,7 +28,7 @@ from vllm.forward_context import (DPMetadata, get_forward_context,
                                   set_forward_context)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
-from vllm.model_executor.model_loader import TensorizerLoader, get_model
+from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
@@ -1564,7 +1564,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
             time_before_load = time.perf_counter()
-            self.model = get_model(vllm_config=self.vllm_config)
+            model_loader = get_model_loader(self.load_config)
+            if not hasattr(self, "model"):
+                logger.info("Loading model from scratch...")
+                self.model = model_loader.load_model(
+                    vllm_config=self.vllm_config,
+                    model_config=self.model_config)
+            else:
+                logger.info(
+                    "Model was already initialized. Loading weights inplace..."
+                )
+                model_loader.load_weights(self.model,
+                                          model_config=self.model_config)
             if self.lora_config:
                 self.model = self.load_lora_model(self.model,
                                                   self.model_config,
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index c57ac3138..5de92351e 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -21,7 +21,7 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA
-from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.model_loader import get_model_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs,
                                     PlaceholderRange)
@@ -171,7 +171,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.encoder_cache_size = encoder_cache_size
 
         # Lazy initialization
-        # self.model: nn.Module  # Set after load_model
+        self.model: nn.Module  # Set after load_model
         self.kv_caches: list[torch.Tensor] = []
         # req_id -> (input_id -> encoder_output)
         self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
@@ -419,7 +419,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
 
     def get_model(self) -> nn.Module:
-        assert self.model is not None
         return self.model
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
@@ -936,7 +935,18 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 "vllm.model_executor.layers.vocab_parallel_embedding."
                 "get_tensor_model_parallel_rank",
                 return_value=xm_tp_rank):
-            model = get_model(vllm_config=self.vllm_config)
+            # model = get_model(vllm_config=self.vllm_config)
+            model_loader = get_model_loader(self.load_config)
+            if not hasattr(self, "model"):
+                logger.info("Loading model from scratch...")
+                model = model_loader.load_model(vllm_config=self.vllm_config,
+                                                model_config=self.model_config)
+            else:
+                logger.info(
+                    "Model was already initialized. Loading weights inplace..."
+                )
+                model_loader.load_weights(self.model,
+                                          model_config=self.model_config)
         if self.lora_config is not None:
             model = self.load_lora_model(model, self.model_config,
                                          self.scheduler_config,
@@ -947,7 +957,8 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         # loading.
         xm.mark_step()
         xm.wait_device_ops()
-        self.model = model
+        if not hasattr(self, "model"):
+            self.model = model
         self.sampler = TPUSampler()
 
     @torch.no_grad()
-- 
GitLab


From 5b168b6d7a36877bc57db75cfb9ff6c9d850ced4 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Mon, 2 Jun 2025 19:07:26 +0800
Subject: [PATCH 112/274] [doc] add pytest tips (#19010)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/contributing/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index f84fb242f..65ae9cc96 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -108,6 +108,9 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files
 
 # Unit tests
 pytest tests/
+
+# Run tests for a single test file with detailed output
+pytest -s -v tests/test_logger.py
 ```
 
 !!! tip
-- 
GitLab


From ebb1ec931871ee1baca23673049865856c44ce4e Mon Sep 17 00:00:00 2001
From: jennyyyyzhen <47012288+jennyyyyzhen@users.noreply.github.com>
Date: Mon, 2 Jun 2025 04:22:54 -0700
Subject: [PATCH 113/274] [Model] enable data parallel for Llama4 vision
 encoder (#18368)

Signed-off-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
Co-authored-by: yZhen <yZhen@fb.com>
Co-authored-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
---
 vllm/config.py                        |   4 +
 vllm/engine/arg_utils.py              |   8 +
 vllm/model_executor/models/mllama4.py | 235 ++++++++++++++++++--------
 vllm/multimodal/utils.py              |  35 ++++
 4 files changed, 214 insertions(+), 68 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index f400e9875..d7bae52cb 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1790,6 +1790,10 @@ class ParallelConfig:
     rank: int = 0
     """Global rank in distributed setup."""
 
+    enable_multimodal_encoder_data_parallel: bool = False
+    """ Use data parallelism instead of tensor parallelism for vision encoder. 
+    Only support LLama4 for now"""
+
     @property
     def world_size_across_dp(self) -> int:
         """world_size_across_dp is TPxPPxDP, it is the size of the world
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 555532526..299c8347f 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -423,6 +423,9 @@ class EngineArgs:
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
     pt_load_map_location: str = LoadConfig.pt_load_map_location
 
+    enable_multimodal_encoder_data_parallel: bool = \
+        ParallelConfig.enable_multimodal_encoder_data_parallel
+
     def __post_init__(self):
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
@@ -637,6 +640,9 @@ class EngineArgs:
                                     **parallel_kwargs["worker_cls"])
         parallel_group.add_argument("--worker-extension-cls",
                                     **parallel_kwargs["worker_extension_cls"])
+        parallel_group.add_argument(
+            "--enable-multimodal-encoder-data-parallel",
+            **parallel_kwargs["enable_multimodal_encoder_data_parallel"])
 
         # KV cache arguments
         cache_kwargs = get_kwargs(CacheConfig)
@@ -1078,6 +1084,8 @@ class EngineArgs:
             distributed_executor_backend=self.distributed_executor_backend,
             worker_cls=self.worker_cls,
             worker_extension_cls=self.worker_extension_cls,
+            enable_multimodal_encoder_data_parallel=self.
+            enable_multimodal_encoder_data_parallel,
         )
 
         speculative_config = self.create_speculative_config(
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 8c98492c0..58549b10e 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -34,6 +34,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs import InputProcessingContext
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
+                                               ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -49,6 +50,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.utils import run_dp_sharded_vision_model
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -84,23 +86,29 @@ class Llama4ImagePatchInputs(TypedDict):
 
 class Llama4VisionMLP(nn.Module):
 
-    def __init__(self,
-                 input_size: int,
-                 intermediate_size: int,
-                 output_size: int,
-                 bias: bool,
-                 output_activation: bool,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        input_size: int,
+        intermediate_size: int,
+        output_size: int,
+        bias: bool,
+        output_activation: bool,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
-        self.fc1 = ColumnParallelLinear(
+        cls_fc1 = (ReplicatedLinear
+                   if use_data_parallel else ColumnParallelLinear)
+        self.fc1 = cls_fc1(
             input_size=input_size,
             output_size=intermediate_size,
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.fc1",
         )
-        self.fc2 = RowParallelLinear(
+        cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear
+        self.fc2 = cls_fc2(
             input_size=intermediate_size,
             output_size=output_size,
             bias=bias,
@@ -155,10 +163,12 @@ def pixel_shuffle(input_tensor, shuffle_ratio):
                                         int(channels / shuffle_ratio))
     reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
 
-    reshaped_tensor = reshaped_tensor.view(batch_size,
-                                           int(height * shuffle_ratio),
-                                           int(width * shuffle_ratio),
-                                           int(channels / (shuffle_ratio**2)))
+    reshaped_tensor = reshaped_tensor.view(
+        batch_size,
+        int(height * shuffle_ratio),
+        int(width * shuffle_ratio),
+        int(channels / (shuffle_ratio**2)),
+    )
     reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
 
     output_tensor = reshaped_tensor.view(batch_size, -1,
@@ -173,6 +183,7 @@ class Llama4VisionPixelShuffleMLP(nn.Module):
         config,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
@@ -186,7 +197,9 @@ class Llama4VisionPixelShuffleMLP(nn.Module):
             bias=config.multi_modal_projector_bias,
             output_activation=True,
             quant_config=quant_config,
-            prefix=f"{prefix}.mlp")
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
         encoded_patches = pixel_shuffle(encoded_patches,
@@ -201,10 +214,12 @@ class Llama4VisionAttention(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
-        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_size = (1 if use_data_parallel else
+                        get_tensor_model_parallel_world_size())
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = config.hidden_size // self.num_heads
@@ -217,22 +232,39 @@ class Llama4VisionAttention(nn.Module):
 
         self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim,
                                        self.scaling)
-        self.qkv_proj = QKVParallelLinear(
-            self.embed_dim,
-            self.head_dim,
-            self.num_heads,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.qkv_proj",
-        )
-        self.o_proj = RowParallelLinear(
-            self.num_heads * self.head_dim,
-            self.embed_dim,
-            bias=True,
-            input_is_parallel=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.o_proj",
-        )
+
+        if use_data_parallel:
+            self.qkv_proj = ReplicatedLinear(
+                self.embed_dim,
+                self.q_size + 2 * self.kv_size,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+            self.o_proj = ReplicatedLinear(
+                self.num_heads * self.head_dim,
+                self.embed_dim,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+            )
+        else:
+            self.qkv_proj = QKVParallelLinear(
+                self.embed_dim,
+                self.head_dim,
+                self.num_heads,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+            self.o_proj = RowParallelLinear(
+                self.num_heads * self.head_dim,
+                self.embed_dim,
+                bias=True,
+                input_is_parallel=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+            )
 
         self.rotary_emb = get_rope(
             head_size=self.head_dim,
@@ -275,22 +307,29 @@ class Llama4VisionEncoderLayer(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_attention_heads = config.num_attention_heads
         self.intermediate_size = config.intermediate_size
 
-        self.self_attn = Llama4VisionAttention(config,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.self_attn")
-        self.mlp = Llama4VisionMLP(input_size=config.hidden_size,
-                                   intermediate_size=config.intermediate_size,
-                                   output_size=config.hidden_size,
-                                   bias=True,
-                                   output_activation=False,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.mlp")
+        self.self_attn = Llama4VisionAttention(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            use_data_parallel=use_data_parallel,
+        )
+        self.mlp = Llama4VisionMLP(
+            input_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=True,
+            output_activation=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
 
         self.input_layernorm = nn.LayerNorm(config.hidden_size)
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)
@@ -322,6 +361,7 @@ class Llama4VisionEncoder(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
@@ -330,6 +370,7 @@ class Llama4VisionEncoder(nn.Module):
                 config,
                 quant_config=quant_config,
                 prefix=f"{prefix}.layers.{layer_idx}",
+                use_data_parallel=use_data_parallel,
             ) for layer_idx in range(config.num_hidden_layers)
         ])
 
@@ -357,23 +398,33 @@ class Llama4VisionEncoder(nn.Module):
 
 class Llama4UnfoldConvolution(nn.Module):
 
-    def __init__(self,
-                 config: Llama4VisionConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         kernel_size = config.patch_size
         if isinstance(kernel_size, int):
             kernel_size = (kernel_size, kernel_size)
         self.unfold = torch.nn.Unfold(kernel_size=kernel_size,
                                       stride=config.patch_size)
-        self.linear = ColumnParallelLinear(config.num_channels *
-                                           kernel_size[0] * kernel_size[1],
-                                           config.hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           gather_output=True,
-                                           prefix=f"{prefix}.linear")
+        params = {
+            "input_size":
+            config.num_channels * kernel_size[0] * kernel_size[1],
+            "output_size": config.hidden_size,
+            "bias": False,
+            "quant_config": quant_config,
+            "prefix": f"{prefix}.linear",
+        }
+        if use_data_parallel:
+            cls = ReplicatedLinear
+        else:
+            cls = ColumnParallelLinear
+            params["gather_output"] = True
+        self.linear = cls(**params)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.unfold(hidden_states)
@@ -389,6 +440,7 @@ class Llama4VisionModel(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
@@ -403,7 +455,9 @@ class Llama4VisionModel(nn.Module):
         self.patch_embedding = Llama4UnfoldConvolution(
             config,
             quant_config=quant_config,
-            prefix=f"{prefix}.patch_embedding")
+            prefix=f"{prefix}.patch_embedding",
+            use_data_parallel=use_data_parallel,
+        )
 
         self.class_embedding = nn.Parameter(self.scale *
                                             torch.randn(self.hidden_size))
@@ -415,11 +469,18 @@ class Llama4VisionModel(nn.Module):
         self.layernorm_post = nn.LayerNorm(self.hidden_size, eps=1e-5)
 
         # encoders
-        self.model = Llama4VisionEncoder(config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.model")
+        self.model = Llama4VisionEncoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.model",
+            use_data_parallel=use_data_parallel,
+        )
         self.vision_adapter = Llama4VisionPixelShuffleMLP(
-            config, quant_config, prefix=f"{prefix}.vision_adapter")
+            config,
+            quant_config,
+            prefix=f"{prefix}.vision_adapter",
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(
         self,
@@ -528,8 +589,9 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
         vision_config = self.info.get_hf_config().vision_config
 
         if processed_outputs.get("pixel_values") is not None:
-            assert "images" in mm_data, \
-                "images expected to be in mm_data when pixel_values is present"
+            assert (
+                "images" in mm_data
+            ), "images expected to be in mm_data when pixel_values is present"
 
             images = mm_data["images"]
             parsed_images = (self._get_data_parser().parse_mm_data({
@@ -546,8 +608,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
                 get_best_fit(
                     (image.size[1], image.size[0]),
                     torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas)
-                for image in parsed_images
+                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
+                ) for image in parsed_images
             ]
             # TODO tile height/width do not necessarily need to match
             aspect_ratios = [(image_size[0] // tile_size,
@@ -659,13 +721,17 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
+        self.use_data_parallel = (vllm_config.parallel_config.
+                                  enable_multimodal_encoder_data_parallel)
         self.config = config
         self.quant_config = quant_config
         self.multimodal_config = multimodal_config
-        self.vision_model = Llama4VisionModel(config.vision_config,
-                                              None,
-                                              prefix=maybe_prefix(
-                                                  prefix, "vision_model"))
+        self.vision_model = Llama4VisionModel(
+            config.vision_config,
+            None,
+            prefix=maybe_prefix(prefix, "vision_model"),
+            use_data_parallel=self.use_data_parallel,
+        )
         self.multi_modal_projector = Llama4MultiModalProjector(
             self.config,
             None,
@@ -709,7 +775,13 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         flat_data = image_input["flat_data"]
         patches_per_image = image_input["patches_per_image"].tolist()
 
-        vision_embeddings_flat = self.vision_model(flat_data)
+        # shard image input
+        if self.use_data_parallel:
+            vision_embeddings_flat = run_dp_sharded_vision_model(
+                flat_data, self.vision_model)
+        else:
+            vision_embeddings_flat = self.vision_model(flat_data)
+
         vision_embeddings_flat = self.multi_modal_projector(
             vision_embeddings_flat)
 
@@ -796,6 +868,30 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return get_prefix_weights(), get_other_weights()
 
+    def _consolidate_qkv_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        qkv_idx_mappings = {
+            ".self_attn.q_proj": 0,
+            ".self_attn.k_proj": 1,
+            ".self_attn.v_proj": 2,
+        }
+        qkv_weights = {}
+        for name, loaded_weight in weights:
+            for weight_name, idx in qkv_idx_mappings.items():
+                if weight_name not in name:
+                    continue
+                new_name = name.replace(weight_name, ".self_attn.qkv_proj")
+                if new_name not in qkv_weights:
+                    qkv_weights[new_name] = [None] * 3
+                qkv_weights[new_name][idx] = loaded_weight
+                break
+            else:
+                yield name, loaded_weight
+        for key, weight in qkv_weights.items():
+            qkv_weight = torch.cat(weight, dim=0)
+            yield key, qkv_weight
+
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
 
@@ -818,9 +914,12 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         assert loaded_language_model_params is not None
         updated_params.update(loaded_language_model_params)
 
+        if self.use_data_parallel:
+            other_weights = self._consolidate_qkv_weights(other_weights)
+
         for name, loaded_weight in other_weights:
             for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
+                if weight_name not in name or self.use_data_parallel:
                     continue
                 name = name.replace(weight_name, param_name)
                 param = params_dict[name]
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 9ddba67bf..1d838f66f 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -12,6 +12,9 @@ from PIL import Image
 
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
 
 from .audio import AudioMediaIO
 from .base import MediaIO
@@ -390,3 +393,35 @@ def group_mm_inputs_by_modality(
     return [
         list(group) for _, group in groupby(mm_inputs, key=modality_group_func)
     ]
+
+
+def run_dp_sharded_vision_model(image_input: torch.Tensor,
+                                vision_model: torch.nn.Module) -> torch.Tensor:
+    """Run a vision model with data parallelism (DP) sharding. The function 
+    will shard the input image tensor on the first dimension and run the vision
+    model
+
+    Args:
+        image_input (torch.Tensor): Image input tensor.
+        vision_model (torch.nn.Module): Vision model.
+
+    Returns:
+        torch.Tensor: Output image embeddings
+    """
+
+    num_chunks = image_input.shape[0]
+    mp_world_size = get_tensor_model_parallel_world_size()
+    num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size
+    num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks
+    pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks)
+    image_input_padded = torch.nn.functional.pad(image_input, pad)
+    rank = get_tensor_model_parallel_rank()
+    image_input_per_rank = image_input_padded[rank *
+                                              num_chunks_per_rank:(rank + 1) *
+                                              num_chunks_per_rank, ...]
+
+    vision_embeddings = vision_model(image_input_per_rank)
+    vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings,
+                                                         dim=0)
+    vision_embeddings = vision_embeddings[:num_chunks, ...]
+    return vision_embeddings
-- 
GitLab


From 20133cfee2e8a21d18be08f90b611cf0424cce96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=CE=B1n=C3=A7ois?= <francois.paupier@gmail.com>
Date: Mon, 2 Jun 2025 17:04:23 +0200
Subject: [PATCH 114/274] [Frontend] enable custom logging for the uvicorn
 server (OpenAI API server) (#18403)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: François Paupier <francois.paupier@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 vllm/entrypoints/openai/api_server.py | 21 +++++++++++++++++++--
 vllm/entrypoints/openai/cli_args.py   |  8 ++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 1e7f88a6a..5a4295ff7 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -5,6 +5,7 @@ import atexit
 import gc
 import importlib
 import inspect
+import json
 import multiprocessing
 import os
 import signal
@@ -16,7 +17,6 @@ from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from json import JSONDecodeError
 from typing import Annotated, Any, Optional
 
 import prometheus_client
@@ -930,7 +930,7 @@ async def invocations(raw_request: Request):
     """
     try:
         body = await raw_request.json()
-    except JSONDecodeError as e:
+    except json.JSONDecodeError as e:
         raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
                             detail=f"JSON decode error: {e}") from e
 
@@ -1003,6 +1003,18 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
         return Response(status_code=200, content=response)
 
 
+def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
+    if not log_config_file:
+        return None
+    try:
+        with open(log_config_file) as f:
+            return json.load(f)
+    except Exception as e:
+        logger.warning("Failed to load log config from file %s: error %s",
+                       log_config_file, e)
+        return None
+
+
 def build_app(args: Namespace) -> FastAPI:
     if args.disable_fastapi_docs:
         app = FastAPI(openapi_url=None,
@@ -1324,6 +1336,11 @@ async def run_server_worker(listen_address,
 
     server_index = client_config.get("client_index", 0) if client_config else 0
 
+    # Load logging config for uvicorn if specified
+    log_config = load_log_config(args.log_config_file)
+    if log_config is not None:
+        uvicorn_kwargs['log_config'] = log_config
+
     async with build_async_engine_client(args, client_config) as engine_client:
         app = build_app(args)
 
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index d01af5e42..f196ff6ed 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -11,6 +11,7 @@ import ssl
 from collections.abc import Sequence
 from typing import Optional, Union, get_args
 
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
                                          validate_chat_template)
@@ -243,6 +244,13 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         " into OpenAI API format, the name register in this plugin can be used "
         "in ``--tool-call-parser``.")
 
+    parser.add_argument(
+        "--log-config-file",
+        type=str,
+        default=envs.VLLM_LOGGING_CONFIG_PATH,
+        help="Path to logging config JSON file for both vllm and uvicorn",
+    )
+
     parser = AsyncEngineArgs.add_cli_args(parser)
 
     parser.add_argument('--max-log-len',
-- 
GitLab


From ca2f6b9c301df6dbe2e5c83c705051f478140695 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Mon, 2 Jun 2025 11:15:53 -0400
Subject: [PATCH 115/274] [Bugfix][Model] Attempt to fix eagle in V0. (#18978)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 vllm/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index d7bae52cb..d0891d670 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2595,7 +2595,8 @@ class SpeculativeConfig:
                     else:
                         eagle_config = EAGLEConfig(
                             self.draft_model_config.hf_config,
-                            method=self.method)
+                            method=self.method,
+                            model_type="eagle")
                         self.draft_model_config.hf_config = eagle_config
 
                 if (self.num_speculative_tokens is not None
-- 
GitLab


From c57d577e8dc41f270a3ce0d604f5d8ac51b08ed7 Mon Sep 17 00:00:00 2001
From: Calvin Chen <45745657+calvin0327@users.noreply.github.com>
Date: Tue, 3 Jun 2025 03:38:23 +0800
Subject: [PATCH 116/274] add an absolute path for run.sh (#18258)

Signed-off-by: calvin chen <120380290@qq.com>
---
 .../disaggregated-prefill-v1/run.sh                  | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh
index 0ebf45a15..c1dcc95a2 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/run.sh
+++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh
@@ -1,5 +1,11 @@
 rm -rf local_storage/
-rm output.txt
 
-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
+if [ -f "output.txt" ]; then
+    rm output.txt
+fi
+
+# The directory of current script
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
-- 
GitLab


From 9112b443a042d8d815880b8780633882ad32b183 Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Mon, 2 Jun 2025 17:06:20 -0700
Subject: [PATCH 117/274] [Hardware][TPU] Initial support of model parallelism
 with single worker using SPMD (#18011)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Hossein Sarshar <hossein.sarshar@gmail.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
---
 .../scripts/hardware_ci/run-tpu-v1-test.sh    |   4 +
 examples/offline_inference/tpu.py             |  29 ++-
 .../v1/tpu/test_spmd_model_weight_loading.py  |  67 +++++++
 tests/v1/tpu/test_tpu_qkv_linear.py           |  89 +++++++++
 vllm/config.py                                |   2 +
 vllm/distributed/tpu_distributed_utils.py     | 177 ++++++++++++++++++
 vllm/envs.py                                  |   5 +
 vllm/model_executor/model_loader/tpu.py       | 112 +++++++++++
 vllm/model_executor/utils.py                  |   4 +-
 vllm/v1/worker/tpu_model_runner.py            | 101 ++++++----
 vllm/v1/worker/tpu_worker.py                  |  87 +++++----
 11 files changed, 605 insertions(+), 72 deletions(-)
 create mode 100644 tests/v1/tpu/test_spmd_model_weight_loading.py
 create mode 100644 tests/v1/tpu/test_tpu_qkv_linear.py
 create mode 100644 vllm/distributed/tpu_distributed_utils.py
 create mode 100644 vllm/model_executor/model_loader/tpu.py

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 610243145..3212b660e 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -155,6 +155,10 @@ run_and_track_test 12 "test_moe_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
     "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index e4a75b3f9..f3c2859d4 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import argparse
+import os
+
 from vllm import LLM, SamplingParams
 
 prompts = [
@@ -18,14 +21,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
 
 
 def main():
+    parser = argparse.ArgumentParser(description="TPU offline inference example")
+    parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode")
+    args = parser.parse_args()
+
+    llm_args = {
+        "model": "Qwen/Qwen2-1.5B-Instruct",
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 4,
+        "max_model_len": 128,
+    }
+    if args.use_spmd:
+        os.environ["VLLM_XLA_USE_SPMD"] = "1"
+        # Can only hardcode the number of chips for now.
+        # calling xr.global_runtime_device_count() beforeing init SPMD env in
+        # torch_xla will mess up the distributed env.
+        llm_args["tensor_parallel_size"] = 8
+        # Use Llama, for num_kv_heads = 8.
+        llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct"
+
     # Set `enforce_eager=True` to avoid ahead-of-time compilation.
     # In real workloads, `enforace_eager` should be `False`.
-    llm = LLM(
-        model="Qwen/Qwen2-1.5B-Instruct",
-        max_num_batched_tokens=64,
-        max_num_seqs=4,
-        max_model_len=128,
-    )
+    llm = LLM(**llm_args)
     outputs = llm.generate(prompts, sampling_params)
     print("-" * 50)
     for output, answer in zip(outputs, answers):
diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py
new file mode 100644
index 000000000..d36edfc3f
--- /dev/null
+++ b/tests/v1/tpu/test_spmd_model_weight_loading.py
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+import gc
+import tempfile
+
+import numpy as np
+import pytest
+import torch_xla.distributed.spmd as xs
+import torch_xla.runtime as xr
+
+from vllm.config import set_current_vllm_config
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             init_distributed_environment)
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.model_loader.tpu import TPUModelLoader
+
+
+def _setup_environment(model):
+    engine_args = EngineArgs(model=model, )
+    vllm_config = engine_args.create_engine_config()
+    with set_current_vllm_config(vllm_config):
+        temp_file = tempfile.mkstemp()[1]
+        init_distributed_environment(
+            1,
+            0,
+            local_rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            backend="gloo")
+        # Under single worker mode, full model is init first and then
+        # partitioned using GSPMD.
+        ensure_model_parallel_initialized(1, 1)
+    return vllm_config
+
+
+MESH = None
+
+
+def _get_spmd_mesh():
+    global MESH
+    if MESH is None:
+        xr.use_spmd()
+        num_devices = xr.global_runtime_device_count()
+        mesh_shape = (num_devices, 1)
+        device_ids = np.array(range(num_devices))
+        MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+    return MESH
+
+
+@pytest.mark.parametrize("model", [
+    "Qwen/Qwen2-1.5B-Instruct",
+    "meta-llama/Llama-3.1-8B-Instruct",
+    "meta-llama/Llama-3.1-70B-Instruct",
+])
+def test_tpu_model_loader(model):
+    # Skip the 70B test if there are less than 8 chips
+    # TODO: Query using torch xla API, the query API is not working
+    # with SPMD now. However, This test is running under SPMD mode.
+    if '70B' in model and xr.global_runtime_device_count() < 8:
+        pytest.skip(
+            "Skipping 70B model if the TPU VM has less than 8 chips to \
+                     avoid OOM.")
+
+    vllm_config = _setup_environment(model)
+    loader = TPUModelLoader(load_config=vllm_config.load_config)
+    mesh = _get_spmd_mesh()
+    model = loader.load_model(vllm_config, vllm_config.model_config, mesh)
+    del model
+    gc.collect()
diff --git a/tests/v1/tpu/test_tpu_qkv_linear.py b/tests/v1/tpu/test_tpu_qkv_linear.py
new file mode 100644
index 000000000..b98570f01
--- /dev/null
+++ b/tests/v1/tpu/test_tpu_qkv_linear.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+import tempfile
+
+import numpy as np
+import pytest
+import torch
+import torch_xla.distributed.spmd as xs
+import torch_xla.runtime as xr
+
+from vllm.config import set_current_vllm_config
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             init_distributed_environment)
+from vllm.distributed.tpu_distributed_utils import XlaQKVParallelLinear
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.layers.linear import QKVParallelLinear
+
+
+@pytest.fixture(autouse=True)
+def setup_environment():
+    # This is a fake config used for init dist env.
+    # QKVParallelLinear needs dist env to be initialized.
+    engine_args = EngineArgs(
+        model="Qwen/Qwen2-1.5B-Instruct",
+        max_model_len=64,
+        max_num_batched_tokens=64,
+        max_num_seqs=4,
+    )
+
+    vllm_config = engine_args.create_engine_config()
+
+    with set_current_vllm_config(vllm_config):
+        temp_file = tempfile.mkstemp()[1]
+        init_distributed_environment(
+            1,
+            0,
+            local_rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            backend="gloo")
+        ensure_model_parallel_initialized(1, 1)
+        yield
+
+
+MESH = None
+
+
+def _get_spmd_mesh():
+    global MESH
+    if MESH is None:
+        xr.use_spmd()
+        num_devices = xr.global_runtime_device_count()
+        mesh_shape = (num_devices, 1)
+        device_ids = np.array(range(num_devices))
+        MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+    return MESH
+
+
+@pytest.mark.parametrize("bias", [False, True])
+# `xr.use_spmd()` will set a global state, and this state is not reversible.
+# Therefore, non-SPMD tests should be run before SPMD tests.
+@pytest.mark.parametrize("mesh", [None, _get_spmd_mesh()])
+@pytest.mark.parametrize("device", ['cpu', 'xla'])
+@torch.no_grad()
+def test_xla_qkv_linear(bias, mesh, device):
+    torch.manual_seed(123)
+
+    qkv_linear = QKVParallelLinear(
+        hidden_size=4096,
+        head_size=128,
+        total_num_heads=32,
+        total_num_kv_heads=8,
+        bias=bias,
+        params_dtype=torch.bfloat16,
+        return_bias=False,
+    )
+
+    qkv_linear.weight.data = torch.rand_like(qkv_linear.weight.data) / 10
+    if bias:
+        qkv_linear.bias.data = torch.rand_like(qkv_linear.bias.data)
+
+    xla_qkv_linear = XlaQKVParallelLinear(qkv_linear, mesh=mesh)
+
+    qkv_linear = qkv_linear.to(device)
+    xla_qkv_linear = xla_qkv_linear.to(device)
+    input_tensor = torch.rand(10, 4096, dtype=torch.bfloat16) / 10
+    input_tensor = input_tensor.to(device)
+
+    output = qkv_linear(input_tensor)
+    xla_output = xla_qkv_linear(input_tensor)
+    assert torch.allclose(output.cpu(), xla_output.cpu())
diff --git a/vllm/config.py b/vllm/config.py
index d0891d670..1bd53e35b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1901,6 +1901,8 @@ class ParallelConfig:
             if current_platform.is_neuron():
                 # neuron uses single process to control multiple devices
                 backend = "uni"
+            elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
+                backend = "uni"
             elif (current_platform.is_cuda()
                   and cuda_device_count_stateless() < self.world_size):
                 if not ray_found:
diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py
new file mode 100644
index 000000000..36ab2eb3a
--- /dev/null
+++ b/vllm/distributed/tpu_distributed_utils.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections import OrderedDict
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch_xla.distributed.spmd as xs
+from torch.nn.parameter import Parameter
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+
+logger = init_logger(__name__)
+
+
+class XlaQKVParallelLinear(nn.Module):
+
+    def __init__(self,
+                 qkv_linear: nn.Module,
+                 mesh: Optional["xs.Mesh"] = None):
+        super().__init__()
+        assert isinstance(qkv_linear, QKVParallelLinear)
+        self.skip_bias_add = qkv_linear.skip_bias_add
+        self.return_bias = qkv_linear.return_bias
+        assert qkv_linear.tp_size == 1, "TP > 1 is only supported under SPMD."
+
+        self.q_weight: Parameter
+        self.k_weight: Parameter
+        self.v_weight: Parameter
+        self.q_bias: Optional[Parameter]
+        self.k_bias: Optional[Parameter]
+        self.v_bias: Optional[Parameter]
+        self._load_weights_from_qkv_linear(qkv_linear)
+        if mesh is not None:
+            self._shard_weight(mesh)
+
+    def _shard_weight(self, mesh: "xs.Mesh"):
+        self.q_weight = Parameter(self.q_weight.to('xla'), requires_grad=False)
+        self.k_weight = Parameter(self.k_weight.to('xla'), requires_grad=False)
+        self.v_weight = Parameter(self.v_weight.to('xla'), requires_grad=False)
+        xs.mark_sharding(self.q_weight, mesh, ('x', None))
+        xs.mark_sharding(self.k_weight, mesh, ('x', None))
+        xs.mark_sharding(self.v_weight, mesh, ('x', None))
+        if self.q_bias is not None:
+            assert self.k_bias is not None and self.v_bias is not None, \
+                "QKVParallelLinear should have q, k, and v biases together."
+            self.q_bias = Parameter(self.q_bias.to('xla'), requires_grad=False)
+            xs.mark_sharding(self.q_bias, mesh, ('x', ))
+            self.k_bias = Parameter(self.k_bias.to('xla'), requires_grad=False)
+            xs.mark_sharding(self.k_bias, mesh, ('x', ))
+            self.v_bias = Parameter(self.v_bias.to('xla'), requires_grad=False)
+            xs.mark_sharding(self.v_bias, mesh, ('x', ))
+
+    def _load_weights_from_qkv_linear(self, qkv_linear: nn.Module):
+        q_proj_size, k_proj_size, _ = qkv_linear.output_sizes
+        # The weight of qkv linear is a concatenation of q, k, and v weights
+        # along the output dimension.
+        qkv_weight = qkv_linear.weight.data.cpu()
+        q_weight = Parameter(qkv_weight[:q_proj_size], requires_grad=False)
+        k_weight = Parameter(qkv_weight[q_proj_size:q_proj_size + k_proj_size],
+                             requires_grad=False)
+        v_weight = Parameter(qkv_weight[q_proj_size + k_proj_size:],
+                             requires_grad=False)
+        self.register_parameter("q_weight", q_weight)
+        self.register_parameter("k_weight", k_weight)
+        self.register_parameter("v_weight", v_weight)
+
+        if qkv_linear.bias is not None:
+            q_bias = Parameter(qkv_linear.bias[:q_proj_size],
+                               requires_grad=False)
+            k_bias = Parameter(qkv_linear.bias[q_proj_size:q_proj_size +
+                                               k_proj_size],
+                               requires_grad=False)
+            v_bias = Parameter(qkv_linear.bias[q_proj_size + k_proj_size:],
+                               requires_grad=False)
+            self.register_parameter("q_bias", q_bias)
+            self.register_parameter("k_bias", k_bias)
+            self.register_parameter("v_bias", v_bias)
+        else:
+            self.register_parameter("q_bias", None)
+            self.register_parameter("k_bias", None)
+            self.register_parameter("v_bias", None)
+
+    def forward(self, input):
+        # Same forward functionality as QKVParallelLinear, but doing qkv porj
+        # separately.
+        q_bias = self.q_bias if not self.skip_bias_add else None
+        k_bias = self.k_bias if not self.skip_bias_add else None
+        v_bias = self.v_bias if not self.skip_bias_add else None
+        q_proj = F.linear(input, self.q_weight, q_bias)
+        k_proj = F.linear(input, self.k_weight, k_bias)
+        v_proj = F.linear(input, self.v_weight, v_bias)
+        # The q/k/v projections will be split outside of the QKVParallelLinear.
+        # Because we are replacing XlaQKVParallelLinear with the
+        # QKVParallelLinear, we need to concatenate q, k, and v projections to
+        # match the output shape of the QKVParallelLinear implementation even if
+        # it seems to be redundant.
+        # The concat and the following split will be noop, and should be
+        # optimized away by the compiler.
+        qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1)
+        output_bias = torch.cat([q_bias, k_bias, v_bias], dim=-1) if \
+                            self.skip_bias_add else None
+        if not self.return_bias:
+            return qkv_proj
+        return qkv_proj, output_bias
+
+
+def partition_column_parallel_linear(layer: torch.nn.Module,
+                                     mesh: xs.Mesh) -> torch.nn.Module:
+    assert isinstance(layer, ColumnParallelLinear)
+    xs.mark_sharding(layer.weight, mesh, ('x', None))
+    logger.debug("Applied column-parallel sharding to %s", layer)
+    return layer
+
+
+def partition_row_parallel_linear(layer: torch.nn.Module,
+                                  mesh: xs.Mesh) -> torch.nn.Module:
+    assert isinstance(layer, RowParallelLinear)
+    xs.mark_sharding(layer.weight, mesh, (None, 'x'))
+    logger.debug("Applied row-parallel sharding to %s", layer)
+    return layer
+
+
+def partition_qkv_parallel_linear(layer: torch.nn.Module,
+                                  mesh: xs.Mesh) -> torch.nn.Module:
+    assert isinstance(layer, QKVParallelLinear)
+    xla_layer = XlaQKVParallelLinear(layer, mesh)
+    logger.debug("Applied qkv parallel sharding to %s", layer)
+    return xla_layer
+
+
+MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict([
+    ("QKVParallelLinear", partition_qkv_parallel_linear),
+    ("ColumnParallelLinear", partition_column_parallel_linear),
+    ("RowParallelLinear", partition_row_parallel_linear),
+])
+
+
+def get_fqn(module):
+    # Get the fully qualified name of the module
+    return module.__class__.__qualname__
+
+
+def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None:
+    """
+    Recursively check a PyTorch model and apply appropriate sharding based on 
+    the MODULE_TYPE_TO_WRAPPING_FUNC mapping.
+    
+    Args:
+        model: torch.nn.Module to process
+        mesh: An XLA SPMD mesh object used for sharding
+    """
+
+    def _process_module(module, name=None, parent=None):
+        for module_type, wrapping_func in MODULE_TYPE_TO_WRAPPING_FUNC.items():
+            if get_fqn(module) == module_type:
+                wrapped_module = wrapping_func(module, mesh)
+
+                assert parent is not None and name is not None, (
+                    "Top Level module is not expected to be wrapped.")
+                if wrapped_module is not module:
+                    # Wrapped module and module are different py object.
+                    # The original module should be replaced by the
+                    # wrapped_module.
+                    logger.debug("replace %s with %s", module, wrapped_module)
+                    setattr(parent, name, wrapped_module)
+
+                module = wrapped_module
+                break
+
+        for child_name, child_module in list(module.named_children()):
+            _process_module(child_module, child_name, module)
+
+    _process_module(model)
diff --git a/vllm/envs.py b/vllm/envs.py
index 44baf5a18..3dd0d9045 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -51,6 +51,7 @@ if TYPE_CHECKING:
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
     VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
+    VLLM_XLA_USE_SPMD: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
@@ -513,6 +514,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, assert on XLA recompilation after each execution step.
     "VLLM_XLA_CHECK_RECOMPILATION":
     lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))),
+
+    # Enable SPMD mode for TPU backend.
+    "VLLM_XLA_USE_SPMD":
+    lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
     "VLLM_FUSED_MOE_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
 
diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py
new file mode 100644
index 000000000..6197bcdba
--- /dev/null
+++ b/vllm/model_executor/model_loader/tpu.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+import time
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch_xla.core.xla_model as xm
+import torch_xla.distributed.spmd as xs
+
+from vllm.config import ModelConfig, VllmConfig
+from vllm.distributed.tpu_distributed_utils import get_fqn, shard_model
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
+from vllm.model_executor.model_loader.utils import (
+    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+
+logger = init_logger(__name__)
+
+
+class TPUModelLoader(DefaultModelLoader):
+    """
+    A TPU model loader for model loading under SPMD mode.
+    """
+
+    def load_model(
+        self,
+        vllm_config: VllmConfig,
+        model_config: ModelConfig,
+        mesh: Optional[xs.Mesh] = None,
+    ) -> nn.Module:
+        # Initialize model and load weights on CPU. Then, during SPMD partition,
+        # weights are sharded and transferred to TPUs.
+        self.counter_before_loading_weights = time.perf_counter()
+        model_config = vllm_config.model_config
+        assert model_config.quantization is None, "Quantization not supported"
+        target_device = torch.device('cpu')
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = initialize_model(vllm_config=vllm_config)
+
+            load_format = vllm_config.load_config.load_format
+            if load_format != "dummy":
+                weights_to_load = {
+                    name
+                    for name, _ in model.named_parameters()
+                }
+                all_weights = self.get_all_weights(model_config, model)
+                loaded_weights = model.load_weights(all_weights)
+                self.counter_after_loading_weights = time.perf_counter()
+                logger.info(
+                    "Loading weights took %.2f seconds",
+                    self.counter_after_loading_weights -
+                    self.counter_before_loading_weights)
+                # We only enable strict check for non-quantized models
+                # that have loaded weights tracking currently.
+                if model_config.quantization is None and \
+                    loaded_weights is not None:
+                    weights_not_loaded = weights_to_load - loaded_weights
+                    if weights_not_loaded:
+                        raise ValueError(
+                            "Following weights were not initialized from "
+                            f"checkpoint: {weights_not_loaded}")
+            else:
+                logger.info("Use dummy weight during weight loading.")
+
+            process_weights_after_loading(model, model_config, target_device)
+
+        counter_before_partition = time.perf_counter()
+        model = model.eval()
+        model = model.to('xla')
+        shard_model(model, mesh)
+        counter_after_partition = time.perf_counter()
+        logger.info("Partition model took %.2f seconds",
+                    counter_after_partition - counter_before_partition)
+
+        # Ensure the model is properly loaded.
+        self._check_model_is_loaded(mesh, model)
+
+        # Need to torch compile after model sharding are done. Because the
+        # compiler hints ('xs.mark_sharding') are torch ops.
+        if not model_config.is_multimodal_model:
+            model.model = torch.compile(model.model, backend="openxla")
+        else:
+            model.language_model.model = \
+                torch.compile(model.language_model.model, backend="openxla")
+        return model
+
+    def _check_model_is_loaded(self, mesh: Optional[xs.Mesh],
+                               model: nn.Module) -> None:
+        """
+        Ensure the model is properly loaded.
+        1. All model parameters and buffers are on XLA device.
+        2. Non-SPMD friendly layers are replaced as expected.
+        """
+        device = xm.xla_device()
+        device_type = str(device.type)
+
+        # Check parameters
+        for name, param in model.named_parameters():
+            assert param.device.type == device_type, f"Parameter {name} is on \
+                {param.device.type} instead of {device_type}"
+
+        # Check buffers
+        for name, buffer in model.named_buffers():
+            assert buffer.device.type == device_type, \
+                f"Buffer {name} is on {buffer.device.type} instead of \
+                    {device_type}"
+
+        for module in model.modules():
+            if (mesh is not None) and (get_fqn(module) == 'QKVParallelLinear'):
+                raise AssertionError("QKVParallelLinear should be replaced by \
+                            XlaQKVParallelLinear under SPMD mode.")
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 1b120c354..27cea6521 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -49,7 +49,9 @@ def _make_synced_weight_loader(original_weight_loader):
 
     def _synced_weight_loader(param, *args, **kwargs):
         original_weight_loader(param, *args, **kwargs)
-        torch._sync(param)
+        # torch._sync doesn't support, is not needed for CPU tensors.
+        if param.device != torch.device("cpu"):
+            torch._sync(param)
 
     return _synced_weight_loader
 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 5de92351e..c5171b973 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -7,21 +7,22 @@ from unittest.mock import patch
 
 import numpy as np
 import torch
-import torch.distributed
 import torch.nn as nn
 # TPU XLA related
 import torch_xla.core.xla_model as xm
+import torch_xla.distributed.spmd as xs
 import torch_xla.runtime as xr
 
 import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.config import ParallelConfig, VllmConfig, get_layers_from_vllm_config
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA
 from vllm.model_executor.model_loader import get_model_loader
+from vllm.model_executor.model_loader.tpu import TPUModelLoader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs,
                                     PlaceholderRange)
@@ -98,6 +99,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self,
         vllm_config: VllmConfig,
         device: torch.device,
+        original_parallel_config: Optional[ParallelConfig] = None,
     ):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
@@ -105,6 +107,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.lora_config = vllm_config.lora_config
         self.load_config = vllm_config.load_config
         self.parallel_config = vllm_config.parallel_config
+        self.original_parallel_config = original_parallel_config
         self.scheduler_config = vllm_config.scheduler_config
         self.speculative_config = vllm_config.speculative_config
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
@@ -118,6 +121,14 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.device = device
         self.check_recompilation = envs.VLLM_XLA_CHECK_RECOMPILATION
 
+        # SPMD Related
+        self.use_spmd = envs.VLLM_XLA_USE_SPMD
+        if self.use_spmd:
+            num_devices = xr.global_runtime_device_count()
+            mesh_shape = (num_devices, 1)
+            device_ids = np.array(range(num_devices))
+            self.mesh = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+
         self.enforce_eager = model_config.enforce_eager
 
         self.num_xla_graphs = 0
@@ -271,6 +282,15 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                        max_num_mm_items_decoder_budget)
                 self.max_num_mm_items_by_modality[modality] = max_num_mm_items
 
+        if not self.use_spmd:
+            self.sample_from_logits_func = torch.compile(
+                self.sample_from_logits,
+                backend="openxla",
+                fullgraph=True,
+                dynamic=False)
+        else:
+            self.sample_from_logits_func = self.sample_from_logits
+
     def _update_num_xla_graphs(self, case_str):
         check_comp = self.check_recompilation and not self.enforce_eager
         if not check_comp:
@@ -825,9 +845,8 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             logits = self.structured_decode(require_struct_decoding,
                                             grammar_bitmask_padded, logits,
                                             arange)
-        selected_token_ids = self.sample_from_logits(logits,
-                                                     tpu_sampling_metadata)
-
+        selected_token_ids = self.sample_from_logits_func(
+            logits, tpu_sampling_metadata)
         # NOTE (NickLucche) Use the original logits (before any penalties or
         # temperature scaling) for the top-k logprobs. We can't enforce it due
         # to recompilations outside torch.compiled code, so just make sure
@@ -935,18 +954,26 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 "vllm.model_executor.layers.vocab_parallel_embedding."
                 "get_tensor_model_parallel_rank",
                 return_value=xm_tp_rank):
-            # model = get_model(vllm_config=self.vllm_config)
-            model_loader = get_model_loader(self.load_config)
-            if not hasattr(self, "model"):
-                logger.info("Loading model from scratch...")
-                model = model_loader.load_model(vllm_config=self.vllm_config,
-                                                model_config=self.model_config)
+            if self.use_spmd:
+                tpu_loader = TPUModelLoader(
+                    load_config=self.vllm_config.load_config)
+                model = tpu_loader.load_model(
+                    vllm_config=self.vllm_config,
+                    model_config=self.vllm_config.model_config,
+                    mesh=self.mesh)
             else:
-                logger.info(
-                    "Model was already initialized. Loading weights inplace..."
-                )
-                model_loader.load_weights(self.model,
-                                          model_config=self.model_config)
+                # model = get_model(vllm_config=self.vllm_config)
+                model_loader = get_model_loader(self.load_config)
+                if not hasattr(self, "model"):
+                    logger.info("Loading model from scratch...")
+                    model = model_loader.load_model(
+                        vllm_config=self.vllm_config,
+                        model_config=self.model_config)
+                else:
+                    logger.info("Model was already initialized. \
+                            Loading weights inplace...")
+                    model_loader.load_weights(self.model,
+                                              model_config=self.model_config)
         if self.lora_config is not None:
             model = self.load_lora_model(model, self.model_config,
                                          self.scheduler_config,
@@ -970,31 +997,25 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                         device=self.device)
         else:
             input_ids = torch.zeros((num_tokens),
-                                    dtype=torch.int32,
-                                    device=self.device)
+                                    dtype=torch.int32).to(self.device)
             inputs_embeds = None
         actual_num_reqs = min(num_tokens, self.max_num_reqs)
         position_ids = torch.zeros(num_tokens,
-                                   dtype=torch.int32,
-                                   device=self.device)
+                                   dtype=torch.int32).to(self.device)
         slot_mapping = torch.zeros(num_tokens,
-                                   dtype=torch.int64,
-                                   device=self.device)
+                                   dtype=torch.int64).to(self.device)
         block_tables = torch.zeros(
             (self.max_num_reqs, self.block_table_cpu.shape[1]),
-            dtype=torch.int32,
-            device=self.device)
+            dtype=torch.int32).to(self.device)
         query_lens = [1] * self.max_num_reqs
         query_start_loc = torch.cumsum(torch.tensor([0] + query_lens,
                                                     dtype=torch.int32),
                                        dim=0,
                                        dtype=torch.int32).to(self.device)
         context_lens = torch.ones((self.max_num_reqs, ),
-                                  dtype=torch.int32,
-                                  device=self.device)
+                                  dtype=torch.int32).to(self.device)
         num_seqs = torch.tensor([actual_num_reqs],
-                                dtype=torch.int32,
-                                device=self.device)
+                                dtype=torch.int32).to(self.device)
         attn_metadata = PallasMetadata(
             slot_mapping=slot_mapping,
             block_tables=block_tables,
@@ -1198,7 +1219,8 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 with self.maybe_select_dummy_loras(
                         self.lora_config, np.array([num_reqs],
                                                    dtype=np.int32)):
-                    self.sample_from_logits(dummy_logits, sampling_metadata)
+                    self.sample_from_logits_func(dummy_logits,
+                                                 sampling_metadata)
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
@@ -1332,14 +1354,22 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 assert tensor_config.size % kv_cache_spec.page_size_bytes == 0
                 num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes
                 if isinstance(kv_cache_spec, AttentionSpec):
+                    if self.use_spmd:
+                        num_kv_heads = kv_cache_spec.num_kv_heads
+                        assert self.original_parallel_config is not None
+                        tp_size = \
+                            self.original_parallel_config.tensor_parallel_size
+                        # TODO: Handle kv cache duplication under SPMD mode.
+                        assert num_kv_heads % tp_size == 0, (
+                            f"num_kv_heads {num_kv_heads} must be divisible by "
+                            f"tp_size {tp_size} under SPMD mode")
                     kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape(
                         num_blocks, kv_cache_spec.block_size,
                         kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
                     dtype = kv_cache_spec.dtype
 
                     tpu_kv_cache = torch.zeros(kv_cache_shape,
-                                               dtype=dtype,
-                                               device=self.device)
+                                               dtype=dtype).to(self.device)
 
                     kv_caches[layer_name] = tpu_kv_cache
                 else:
@@ -1350,6 +1380,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             self.vllm_config.compilation_config.static_forward_context,
             self.kv_caches)
 
+        if self.use_spmd:
+            # Shard KV Cache
+            for cache in self.kv_caches:
+                xs.mark_sharding(cache, self.mesh, (None, 'x', None, None))
+
     def reset_dynamo_cache(self):
         if self.is_multimodal_model:
             compiled_model = self.model.get_language_model().model
@@ -1370,7 +1405,9 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                        sample_hidden_states: torch.Tensor) -> torch.Tensor:
         return self.model.compute_logits(sample_hidden_states, None)
 
-    @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
+    # TODO: Under SPMD mode, sample_from_logits has correctness issue.
+    #       Re-enable the torch.compile once the issue is fixed in torchxla.
+    # @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
     def sample_from_logits(
             self, logits: torch.Tensor,
             sampling_metadata: TPUSupportedSamplingMetadata) -> torch.Tensor:
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 0707e17af..bf0a5777c 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -45,6 +45,15 @@ class TPUWorker:
         self.lora_config = vllm_config.lora_config
         self.load_config = vllm_config.load_config
         self.parallel_config = vllm_config.parallel_config
+        self.use_spmd = envs.VLLM_XLA_USE_SPMD
+        self.original_parallel_config = None
+        if self.use_spmd:
+            # Under SPMD mode, distributed env is initialized as if there is
+            # only one worker/device.
+            self.original_parallel_config = self.parallel_config
+            self.parallel_config.tensor_parallel_size = 1
+            self.parallel_config.pipeline_parallel_size = 1
+            self.parallel_config.world_size = 1
         self.scheduler_config = vllm_config.scheduler_config
         self.device_config = vllm_config.device_config
         self.speculative_config = vllm_config.speculative_config
@@ -95,10 +104,9 @@ class TPUWorker:
         torch.set_default_dtype(self.model_config.dtype)
 
         # Initialize the distributed environment.
-        init_tpu_worker_distributed_environment(self.parallel_config,
-                                                self.rank,
-                                                self.distributed_init_method,
-                                                self.local_rank)
+        self._init_tpu_worker_distributed_environment(
+            self.parallel_config, self.rank, self.distributed_init_method,
+            self.local_rank)
 
         # Device initialization should happen after initializing
         # the distributed runtime.
@@ -132,7 +140,9 @@ class TPUWorker:
             xr.initialize_cache(per_rank_path, readonly=False)
 
         # Init ModelRunner here, so that we have access to self.device.
-        self.model_runner = TPUModelRunner(self.vllm_config, self.device)
+        self.model_runner = \
+            TPUModelRunner(self.vllm_config, self.device,
+                           self.original_parallel_config)
 
         if rank == 0:
             # If usage stat is enabled, collect relevant info.
@@ -147,9 +157,7 @@ class TPUWorker:
 
                 # Use an empty tensor instead of `None`` to force Dynamo to pass
                 # it by reference, rather by specializing on the value ``None``.
-                tpu_kv_cache = torch.tensor([],
-                                            dtype=dtype,
-                                            device=self.device)
+                tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
                 kv_caches[layer_name] = tpu_kv_cache
             else:
                 raise NotImplementedError(
@@ -178,9 +186,20 @@ class TPUWorker:
 
         # Get the maximum amount of memory used by the model weights and
         # intermediate activations.
-        m = xm.get_memory_info(self.device)
-        total_memory_size = m["bytes_limit"]
-        current_mem = m["bytes_used"]
+        if self.use_spmd:
+            # This is a workaround for the TPU SPMD mode. The get_memory_info
+            # API doesn't work with SPMD mode in PyTorch/XLA.
+            # TODO: use xm.get_memory_info for SPMD once it's supported in
+            # PyTorch/XLA.
+            import tpu_info
+            chip_type, _ = tpu_info.device.get_local_chips()
+            device_usage = tpu_info.metrics.get_chip_usage(chip_type)
+            total_memory_size = device_usage[0].total_memory
+            current_mem = device_usage[0].memory_usage
+        else:
+            m = xm.get_memory_info(self.device)
+            total_memory_size = m["bytes_limit"]
+            current_mem = m["bytes_used"]
         # Ideally we would use profiled = m["peak_bytes_used"] to
         # get weights + activations. But there is memory used during
         # compilation / weight loading that impacts the peak and
@@ -241,28 +260,30 @@ class TPUWorker:
         # worker will always be healthy as long as it's running.
         return
 
-
-def init_tpu_worker_distributed_environment(
-    parallel_config: ParallelConfig,
-    rank: int,
-    distributed_init_method: Optional[str] = None,
-    local_rank: int = -1,
-) -> None:
-    """Initialize the distributed environment."""
-
-    # NOTE(woosuk): This is just to initialize the TP group and broadcast
-    # the input objects on CPU. The all-reduce and all-gather ops on TPU
-    # are invoked by `xm.all_reduce` and `xm.all_gather` which use their
-    # own context.
-    init_distributed_environment(
-        world_size=parallel_config.world_size,
-        rank=rank,
-        local_rank=local_rank,
-        distributed_init_method=distributed_init_method,
-        backend="gloo",
-    )
-    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size)
+    def _init_tpu_worker_distributed_environment(
+        self,
+        parallel_config: ParallelConfig,
+        rank: int,
+        distributed_init_method: Optional[str] = None,
+        local_rank: int = -1,
+    ) -> None:
+        """Initialize the distributed environment."""
+        if self.use_spmd:
+            xr.use_spmd()
+        # NOTE(woosuk): This is just to initialize the TP group and broadcast
+        # the input objects on CPU. The all-reduce and all-gather ops on TPU
+        # are invoked by `xm.all_reduce` and `xm.all_gather` which use their
+        # own context.
+        init_distributed_environment(
+            world_size=parallel_config.world_size,
+            rank=rank,
+            local_rank=local_rank,
+            distributed_init_method=distributed_init_method,
+            backend="gloo",
+        )
+        ensure_model_parallel_initialized(
+            parallel_config.tensor_parallel_size,
+            parallel_config.pipeline_parallel_size)
 
 
 try:
-- 
GitLab


From 5bc1ad6cee754405464a9957e86cf3a9302e4986 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?=
 <ohg3417@gmail.com>
Date: Tue, 3 Jun 2025 11:49:48 +0900
Subject: [PATCH 118/274] [Doc] Remove duplicate TOCs during MkDocs migration
 (#19021)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
---
 docs/cli/README.md       | 13 -------------
 docs/deployment/nginx.md | 10 ----------
 2 files changed, 23 deletions(-)

diff --git a/docs/cli/README.md b/docs/cli/README.md
index 5feb316d6..f43ce7663 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -12,19 +12,6 @@ Available Commands:
 vllm {chat,complete,serve,bench,collect-env,run-batch}
 ```
 
-## Table of Contents
-
-- [serve](#serve)
-- [chat](#chat)
-- [complete](#complete)
-- [bench](#bench)
-  - [latency](#latency)
-  - [serve](#serve-1)
-  - [throughput](#throughput)
-- [collect-env](#collect-env)
-- [run-batch](#run-batch)
-- [More Help](#more-help)
-
 ## serve
 
 Start the vLLM OpenAI Compatible API server.
diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md
index 80242919b..f0ff5c1d0 100644
--- a/docs/deployment/nginx.md
+++ b/docs/deployment/nginx.md
@@ -5,16 +5,6 @@ title: Using Nginx
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
-Table of contents:
-
-1. [Build Nginx Container][nginxloadbalancer-nginx-build]
-2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf]
-3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container]
-4. [Create Docker Network][nginxloadbalancer-nginx-docker-network]
-5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container]
-6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx]
-7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx]
-
 [](){ #nginxloadbalancer-nginx-build }
 
 ## Build Nginx Container
-- 
GitLab


From 8a57872b2ac9b01004ae1d3a3a689de218ea5be5 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 2 Jun 2025 23:36:51 -0400
Subject: [PATCH 119/274] [Bugfix][EP+DP] Use pplx-kernel internode instead of
 intranode (#19034)

Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 vllm/distributed/device_communicators/all2all.py | 4 ++++
 vllm/model_executor/layers/fused_moe/layer.py    | 6 +++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index a250ec89c..7177754a3 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -83,6 +83,10 @@ class PPLXAll2AllManager(All2AllManagerBase):
         assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."  # noqa
         super().__init__(cpu_group)
 
+        # TODO(tms): Disable pplx-a2a intranode as it fails with the error:
+        # failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa
+        self.internode = True
+
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index af7b98e14..1e193c909 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -269,9 +269,13 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                 hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
                     (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
                     torch.float32.itemsize)),
-                group_name=all2all_manager.cpu_group.group_name,
             )
 
+            # Intranode pplx a2a takes a group name while internode does not.
+            if not all2all_manager.internode:
+                all_to_all_args[
+                    "group_name"] = all2all_manager.cpu_group.group_name
+
             handle = all2all_manager.get_handle(all_to_all_args)
 
             prepare_finalize = PplxPrepareAndFinalize(
-- 
GitLab


From 4ce42f92042ef8a24e925fc7121f7c98e51f73ba Mon Sep 17 00:00:00 2001
From: Concurrensee <yidawu@alumni.cmu.edu>
Date: Mon, 2 Jun 2025 22:46:44 -0500
Subject: [PATCH 120/274] Adding "LoRA Test %N" to AMD production tests
 (#18929)

Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
---
 .buildkite/scripts/hardware_ci/run-amd-test.sh | 4 ++++
 .buildkite/test-pipeline.yaml                  | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index bbc896ec6..6e9af1e72 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi
+
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index bff2f69c1..5fb8ceaac 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -287,7 +287,7 @@ steps:
     - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-- 
GitLab


From 8655f47f37750eb5d00992d39305d6705659983f Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Tue, 3 Jun 2025 11:46:47 +0800
Subject: [PATCH 121/274] [CPU][CI] Re-enable the CPU CI tests (#19046)

Signed-off-by: jiang.li <jiang1.li@intel.com>
---
 .../scripts/hardware_ci/run-cpu-test.sh       | 42 +++++++++----------
 docker/Dockerfile.cpu                         | 10 +++--
 vllm/distributed/parallel_state.py            |  3 +-
 3 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 40f3df960..0a1193560 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -8,67 +8,65 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
 # Setup cleanup
 remove_docker_container() { 
     set -e; 
-    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
-    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
-  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/decoder_only/language -m cpu_model
-    pytest -v -s tests/models/embedding/language -m cpu_model
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
-    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/language/generation -m cpu_model
+    pytest -v -s tests/models/language/pooling -m cpu_model
+    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"  
 
   # online serving
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10 
     export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -83,7 +81,7 @@ function cpu_tests() {
       --tokenizer facebook/opt-125m"
 
   # Run multi-lora tests
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/lora/test_qwen2vl.py"
@@ -91,4 +89,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 5395b3884..6db2f307a 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -75,6 +75,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 
 
@@ -85,7 +86,7 @@ WORKDIR /workspace/vllm
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl
+    apt-get install -y --no-install-recommends vim numactl xz-utils
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -108,8 +109,11 @@ FROM base AS vllm-test
 WORKDIR /workspace/
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
-    uv pip install -r requirements/test.txt
+    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
+    cp requirements/test.in requirements/test-cpu.in && \
+    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
+    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
+    uv pip install -r requirements/cpu-test.txt
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 6e48c02da..32c9301bf 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1203,7 +1203,8 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     if empty_cache is not None:
         empty_cache()
     try:
-        torch._C._host_emptyCache()
+        if not current_platform.is_cpu():
+            torch._C._host_emptyCache()
     except AttributeError:
         logger.warning(
             "torch._C._host_emptyCache() only available in Pytorch >=2.5")
-- 
GitLab


From 9e6f61e8c3df833537e4bea6c33f85eca5d73b15 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Mon, 2 Jun 2025 23:47:47 -0400
Subject: [PATCH 122/274] [ROCm][Build] Clean up the ROCm build (#19040)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 CMakeLists.txt                                  |  4 ----
 docker/Dockerfile.rocm                          | 17 -----------------
 .../installation/gpu/rocm.inc.md                |  2 --
 requirements/rocm.txt                           |  2 ++
 4 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6536e9a57..87aa23c08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -182,9 +182,6 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
-#
-# Set rocm version dev int.
-#
 if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@@ -192,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
 
-
   #
   # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
   # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index e60cf5e69..b186f88d2 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -1,7 +1,5 @@
 # default base image
 ARG REMOTE_VLLM="0"
-ARG USE_CYTHON="0"
-ARG BUILD_RPD="1"
 ARG COMMON_WORKDIR=/app
 ARG BASE_IMAGE=rocm/vllm-dev:base
 
@@ -36,12 +34,10 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
-ARG USE_CYTHON
 # Build vLLM
 RUN cd vllm \
     && python3 -m pip install -r requirements/rocm.txt \
     && python3 setup.py clean --all  \
-    && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
     && python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
@@ -90,13 +86,6 @@ RUN case "$(which python3)" in \
         *) ;; esac
 
 RUN python3 -m pip install --upgrade huggingface-hub[cli]
-ARG BUILD_RPD
-RUN if [ ${BUILD_RPD} -eq "1" ]; then \
-    git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
-    && cd rocmProfileData/rpd_tracer \
-    && pip install -r requirements.txt && cd ../ \
-    && make && make install \
-    && cd hipMarker && python3 setup.py install ; fi
 
 # Install vLLM
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
@@ -117,12 +106,6 @@ ENV TOKENIZERS_PARALLELISM=false
 # ENV that can improve safe tensor loading, and end-to-end time
 ENV SAFETENSORS_FAST_GPU=1
 
-# User-friendly environment setting for multi-processing to avoid below RuntimeError.
-# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
-# you must use the 'spawn' start method 
-# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 0029b3a24..8b7dc6dd0 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -179,8 +179,6 @@ It is important that the user kicks off the docker build using buildkit. Either
 It provides flexibility to customize the build of docker image using the following arguments:
 
 - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
-- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
-- `BUILD_RPD`: Include RocmProfileData profiling tool in the image
 - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
 
 Their values can be passed in when running `docker build` with `--build-arg` options.
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 8a84f2ff1..fb1febdac 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -12,5 +12,7 @@ ray>=2.10.0,<2.45.0
 peft
 pytest-asyncio
 tensorizer>=2.9.0
+setuptools-scm>=8
+setuptools>=77.0.3,<80.0.0
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
-- 
GitLab


From bdce64f2365b39335141f8efcb3a0a8ecc559153 Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Mon, 2 Jun 2025 21:15:13 -0700
Subject: [PATCH 123/274] [V1] Support DP with Ray (#18779)

---
 requirements/test.in          |   2 +-
 requirements/test.txt         |  50 +++++++
 tests/v1/test_async_llm_dp.py |  13 +-
 vllm/config.py                |   6 +
 vllm/engine/arg_utils.py      |  29 +++-
 vllm/entrypoints/cli/serve.py |  35 ++++-
 vllm/v1/engine/async_llm.py   |  13 +-
 vllm/v1/engine/core.py        | 168 ++++++++++++++++------
 vllm/v1/engine/core_client.py |  74 ++++++++--
 vllm/v1/utils.py              | 257 +++++++++++++++++++++++++++++-----
 10 files changed, 539 insertions(+), 108 deletions(-)

diff --git a/requirements/test.in b/requirements/test.in
index e906752ff..9b574a09f 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
 peft
 pqdm
-ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
+ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
diff --git a/requirements/test.txt b/requirements/test.txt
index 60dcaca81..03aec80ac 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -10,9 +10,13 @@ aiohappyeyeballs==2.4.3
     # via aiohttp
 aiohttp==3.10.11
     # via
+    #   aiohttp-cors
     #   datasets
     #   fsspec
     #   lm-eval
+    #   ray
+aiohttp-cors==0.8.1
+    # via ray
 aiosignal==1.3.1
     # via
     #   aiohttp
@@ -57,6 +61,8 @@ bounded-pool-executor==0.0.3
     # via pqdm
 buildkite-test-collector==0.1.9
     # via -r requirements/test.in
+cachetools==5.5.2
+    # via google-auth
 certifi==2024.8.30
     # via
     #   httpcore
@@ -81,6 +87,8 @@ colorama==0.4.6
     #   sacrebleu
     #   schemathesis
     #   tqdm-multiprocess
+colorful==0.5.6
+    # via ray
 contourpy==1.3.0
     # via matplotlib
 cramjam==2.9.0
@@ -108,6 +116,8 @@ dill==0.3.8
     #   evaluate
     #   lm-eval
     #   multiprocess
+distlib==0.3.9
+    # via virtualenv
 dnspython==2.7.0
     # via email-validator
 docopt==0.6.2
@@ -143,6 +153,7 @@ filelock==3.16.1
     #   ray
     #   torch
     #   transformers
+    #   virtualenv
 fonttools==4.54.1
     # via matplotlib
 fqdn==1.5.1
@@ -165,8 +176,16 @@ genai-perf==0.0.8
     # via -r requirements/test.in
 genson==1.3.0
     # via datamodel-code-generator
+google-api-core==2.24.2
+    # via opencensus
+google-auth==2.40.2
+    # via google-api-core
+googleapis-common-protos==1.70.0
+    # via google-api-core
 graphql-core==3.2.6
     # via hypothesis-graphql
+grpcio==1.71.0
+    # via ray
 h11==0.14.0
     # via httpcore
 harfile==0.3.0
@@ -392,6 +411,10 @@ nvidia-nvjitlink-cu12==12.8.61
     #   torch
 nvidia-nvtx-cu12==12.8.55
     # via torch
+opencensus==0.11.4
+    # via ray
+opencensus-context==0.1.3
+    # via opencensus
 opencv-python-headless==4.11.0.86
     # via
     #   -r requirements/test.in
@@ -445,6 +468,7 @@ platformdirs==4.3.6
     # via
     #   black
     #   pooch
+    #   virtualenv
 plotly==5.24.1
     # via genai-perf
 pluggy==1.5.0
@@ -457,10 +481,17 @@ portalocker==2.10.1
     # via sacrebleu
 pqdm==0.2.0
     # via -r requirements/test.in
+prometheus-client==0.22.0
+    # via ray
 propcache==0.2.0
     # via yarl
+proto-plus==1.26.1
+    # via google-api-core
 protobuf==5.28.3
     # via
+    #   google-api-core
+    #   googleapis-common-protos
+    #   proto-plus
     #   ray
     #   tensorizer
 psutil==6.1.0
@@ -470,10 +501,18 @@ psutil==6.1.0
     #   tensorizer
 py==1.11.0
     # via pytest-forked
+py-spy==0.4.0
+    # via ray
 pyarrow==18.0.0
     # via
     #   datasets
     #   genai-perf
+pyasn1==0.6.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.2
+    # via google-auth
 pybind11==2.13.6
     # via lm-eval
 pycparser==2.22
@@ -486,6 +525,7 @@ pydantic==2.11.5
     #   datamodel-code-generator
     #   mistral-common
     #   mteb
+    #   ray
 pydantic-core==2.33.2
     # via pydantic
 pygments==2.18.0
@@ -573,6 +613,7 @@ requests==2.32.3
     #   buildkite-test-collector
     #   datasets
     #   evaluate
+    #   google-api-core
     #   huggingface-hub
     #   lm-eval
     #   mistral-common
@@ -601,6 +642,8 @@ rpds-py==0.20.1
     # via
     #   jsonschema
     #   referencing
+rsa==4.9.1
+    # via google-auth
 runai-model-streamer==0.11.0
     # via -r requirements/test.in
 runai-model-streamer-s3==0.11.0
@@ -648,9 +691,12 @@ shellingham==1.5.4
 six==1.16.0
     # via
     #   junit-xml
+    #   opencensus
     #   python-dateutil
     #   rfc3339-validator
     #   rouge-score
+smart-open==7.1.0
+    # via ray
 sniffio==1.3.1
     # via
     #   anyio
@@ -801,6 +847,8 @@ urllib3==2.2.3
     #   tritonclient
 vector-quantize-pytorch==1.21.2
     # via -r requirements/test.in
+virtualenv==20.31.2
+    # via ray
 vocos==0.1.0
     # via -r requirements/test.in
 webcolors==24.11.1
@@ -809,6 +857,8 @@ werkzeug==3.1.3
     # via schemathesis
 word2number==1.1
     # via lm-eval
+wrapt==1.17.2
+    # via smart-open
 xxhash==3.5.0
     # via
     #   datasets
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py
index ce4c4d198..366fa3b25 100644
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -59,14 +59,22 @@ async def generate(engine: AsyncLLM,
 
 
 @pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind",
+    [
+        RequestOutputKind.DELTA,
+        RequestOutputKind.FINAL_ONLY,
+    ],
+)
+@pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"])
 @pytest.mark.asyncio
-async def test_load(output_kind: RequestOutputKind):
+async def test_load(output_kind: RequestOutputKind,
+                    data_parallel_backend: str):
 
     with ExitStack() as after:
 
         prompt = "This is a test of data parallel"
 
+        engine_args.data_parallel_backend = data_parallel_backend
         engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
@@ -82,7 +90,6 @@ async def test_load(output_kind: RequestOutputKind):
                 asyncio.create_task(
                     generate(engine, request_id, prompt, output_kind,
                              NUM_EXPECTED_TOKENS)))
-
         # Confirm that we got all the EXPECTED tokens from the requests.
         done, pending = await asyncio.wait(tasks,
                                            return_when=asyncio.FIRST_EXCEPTION)
diff --git a/vllm/config.py b/vllm/config.py
index 1bd53e35b..8aa1b5610 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1742,6 +1742,8 @@ class ParallelConfig:
     """Port for data parallel messaging."""
     data_parallel_master_port: int = 29500
     """Port of the data parallel master."""
+    data_parallel_backend: str = "mp"
+    """Backend to use for data parallel, either "mp" or "ray"."""
     enable_expert_parallel: bool = False
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
     max_parallel_loading_workers: Optional[int] = None
@@ -1911,6 +1913,10 @@ class ParallelConfig:
                                      "please install Ray with `pip install "
                                      "ray`.") from ray_utils.ray_import_err
                 backend = "ray"
+            elif self.data_parallel_backend == "ray":
+                logger.info("Using ray distributed inference because "
+                            "data_parallel_backend is ray")
+                backend = "ray"
             elif ray_found:
                 if self.placement_group:
                     backend = "ray"
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 299c8347f..a5b155024 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -39,7 +39,7 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
-                        GiB_bytes, is_in_ray_actor)
+                        GiB_bytes, get_ip, is_in_ray_actor)
 
 # yapf: enable
 
@@ -292,6 +292,7 @@ class EngineArgs:
     data_parallel_size_local: Optional[int] = None
     data_parallel_address: Optional[str] = None
     data_parallel_rpc_port: Optional[int] = None
+    data_parallel_backend: str = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     max_parallel_loading_workers: Optional[
         int] = ParallelConfig.max_parallel_loading_workers
@@ -624,6 +625,12 @@ class EngineArgs:
                                     type=int,
                                     help='Port for data parallel RPC '
                                     'communication.')
+        parallel_group.add_argument('--data-parallel-backend',
+                                    '-dpb',
+                                    type=str,
+                                    default='mp',
+                                    help='Backend for data parallel, either '
+                                    '"mp" or "ray".')
         parallel_group.add_argument(
             "--enable-expert-parallel",
             **parallel_kwargs["enable_expert_parallel"])
@@ -1059,9 +1066,20 @@ class EngineArgs:
 
         # DP address, used in multi-node case for torch distributed group
         # and ZMQ sockets.
-        data_parallel_address = self.data_parallel_address if (
-            self.data_parallel_address
-            is not None) else ParallelConfig.data_parallel_master_ip
+        if self.data_parallel_address is None:
+            if self.data_parallel_backend == "ray":
+                host_ip = get_ip()
+                logger.info(
+                    "Using host IP %s as ray-based data parallel address",
+                    host_ip)
+                data_parallel_address = host_ip
+            else:
+                assert self.data_parallel_backend == "mp", (
+                    "data_parallel_backend can only be ray or mp, got %s",
+                    self.data_parallel_backend)
+                data_parallel_address = ParallelConfig.data_parallel_master_ip
+        else:
+            data_parallel_address = self.data_parallel_address
 
         # This port is only used when there are remote data parallel engines,
         # otherwise the local IPC transport is used.
@@ -1069,6 +1087,8 @@ class EngineArgs:
             self.data_parallel_rpc_port
             is not None) else ParallelConfig.data_parallel_rpc_port
 
+        data_parallel_backend = self.data_parallel_backend
+
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
@@ -1076,6 +1096,7 @@ class EngineArgs:
             data_parallel_size_local=data_parallel_size_local,
             data_parallel_master_ip=data_parallel_address,
             data_parallel_rpc_port=data_parallel_rpc_port,
+            data_parallel_backend=data_parallel_backend,
             enable_expert_parallel=self.enable_expert_parallel,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
             disable_custom_all_reduce=self.disable_custom_all_reduce,
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index e65c97073..040ae166a 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -27,7 +27,8 @@ from vllm.v1.engine.core_client import CoreEngineProcManager
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
 from vllm.v1.utils import (APIServerProcessManager, CoreEngine,
-                           EngineZmqAddresses, get_engine_client_zmq_addr,
+                           CoreEngineActorManager, EngineZmqAddresses,
+                           get_engine_client_zmq_addr,
                            wait_for_completion_or_failure,
                            wait_for_engine_startup)
 
@@ -229,6 +230,31 @@ def run_multi_api_server(args: argparse.Namespace):
         logger.info("Started DP Coordinator process (PID: %d)",
                     coordinator.proc.pid)
 
+    if parallel_config.data_parallel_backend == "ray":
+        logger.info("Starting ray-based data parallel backend")
+
+        engine_actor_manager = CoreEngineActorManager(
+            vllm_config=vllm_config,
+            addresses=addresses,
+            executor_class=Executor.get_class(vllm_config),
+            log_stats=not engine_args.disable_log_stats,
+        )
+        # Start API servers using the manager
+        api_server_manager = APIServerProcessManager(
+            target_server_fn=run_api_server_worker_proc,
+            listen_address=listen_address,
+            sock=sock,
+            args=args,
+            num_servers=num_api_servers,
+            input_addresses=input_addresses,
+            output_addresses=output_addresses,
+            stats_update_address=stats_update_address)
+
+        wait_for_completion_or_failure(api_server_manager=api_server_manager,
+                                       engine_manager=engine_actor_manager,
+                                       coordinator=coordinator)
+        return
+
     handshake_address = get_engine_client_zmq_addr(
         local_only, host, parallel_config.data_parallel_rpc_port)
 
@@ -277,10 +303,9 @@ def run_multi_api_server(args: argparse.Namespace):
         )
 
         # Wait for API servers
-        wait_for_completion_or_failure(
-            api_server_manager=api_server_manager,
-            local_engine_manager=local_engine_manager,
-            coordinator=coordinator)
+        wait_for_completion_or_failure(api_server_manager=api_server_manager,
+                                       engine_manager=local_engine_manager,
+                                       coordinator=coordinator)
 
 
 def run_api_server_worker_proc(listen_address,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 86781e752..4b235c596 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -27,7 +27,8 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Device, cdiv
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient
+from vllm.v1.engine.core_client import (AsyncMPClient, DPAsyncMPClient,
+                                        RayDPClient)
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 from vllm.v1.engine.output_processor import (OutputProcessor,
                                              RequestOutputCollector)
@@ -119,9 +120,13 @@ class AsyncLLM(EngineClient):
                                                 log_stats=self.log_stats)
 
         # EngineCore (starts the engine in background process).
-        core_client_class = AsyncMPClient if (
-            vllm_config.parallel_config.data_parallel_size
-            == 1) else DPAsyncMPClient
+        core_client_class: type[AsyncMPClient]
+        if vllm_config.parallel_config.data_parallel_size == 1:
+            core_client_class = AsyncMPClient
+        elif vllm_config.parallel_config.data_parallel_backend == "ray":
+            core_client_class = RayDPClient
+        else:
+            core_client_class = DPAsyncMPClient
 
         self.engine_core = core_client_class(
             vllm_config=vllm_config,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index a02abb62b..7253d1dc6 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -6,8 +6,9 @@ import sys
 import threading
 import time
 from collections import deque
+from collections.abc import Generator
 from concurrent.futures import Future
-from contextlib import ExitStack
+from contextlib import ExitStack, contextmanager
 from inspect import isclass, signature
 from logging import DEBUG
 from typing import Any, Callable, Optional, TypeVar, Union
@@ -367,60 +368,35 @@ class EngineCoreProc(EngineCore):
         log_stats: bool,
         engine_index: int = 0,
     ):
-        input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
-
-        executor_fail_callback = lambda: input_queue.put_nowait(
+        self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
+        self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
+                                              bytes]]()
+        executor_fail_callback = lambda: self.input_queue.put_nowait(
             (EngineCoreRequestType.EXECUTOR_FAILED, b''))
 
-        # Create input socket.
-        input_ctx = zmq.Context()
-        identity = engine_index.to_bytes(length=2, byteorder="little")
-        with make_zmq_socket(input_ctx,
-                             handshake_address,
-                             zmq.DEALER,
-                             identity=identity,
-                             linger=5000,
-                             bind=False) as handshake_socket:
+        self.engine_index = engine_index
+        identity = self.engine_index.to_bytes(length=2, byteorder="little")
+        self.engines_running = False
 
-            # Register engine with front-end.
-            addresses = self.startup_handshake(handshake_socket, on_head_node,
-                                               vllm_config.parallel_config)
+        with self._perform_handshake(handshake_address, identity, on_head_node,
+                                     vllm_config) as addresses:
             self.client_count = len(addresses.outputs)
 
-            # Update config which may have changed from the handshake.
-            vllm_config.__post_init__()
-
             # Set up data parallel environment.
             self.has_coordinator = addresses.coordinator_output is not None
             self._init_data_parallel(vllm_config)
 
-            # Initialize engine core and model.
             super().__init__(vllm_config, executor_class, log_stats,
                              executor_fail_callback)
 
-            self.engine_index = engine_index
-            self.step_fn = (self.step if self.batch_queue is None else
-                            self.step_with_batch_queue)
-            self.engines_running = False
-            self.last_counts = (0, 0)
-
-            # Send ready message.
-            num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks
-            handshake_socket.send(
-                msgspec.msgpack.encode({
-                    "status": "READY",
-                    "local": on_head_node,
-                    "num_gpu_blocks": num_gpu_blocks,
-                }))
+        self.step_fn = (self.step if self.batch_queue is None else
+                        self.step_with_batch_queue)
 
         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
         # and to overlap some serialization/deserialization with the
         # model forward pass.
         # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
-        self.input_queue = input_queue
-        self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
-                                              bytes]]()
         threading.Thread(target=self.process_input_sockets,
                          args=(addresses.inputs, addresses.coordinator_input,
                                identity),
@@ -428,10 +404,40 @@ class EngineCoreProc(EngineCore):
         self.output_thread = threading.Thread(
             target=self.process_output_sockets,
             args=(addresses.outputs, addresses.coordinator_output,
-                  engine_index),
+                  self.engine_index),
             daemon=True)
         self.output_thread.start()
 
+    @contextmanager
+    def _perform_handshake(
+            self, handshake_address: str, identity: bytes, on_head_node: bool,
+            vllm_config: VllmConfig
+    ) -> Generator[EngineZmqAddresses, None, None]:
+        input_ctx = zmq.Context()
+        with make_zmq_socket(input_ctx,
+                             handshake_address,
+                             zmq.DEALER,
+                             identity=identity,
+                             linger=5000,
+                             bind=False) as handshake_socket:
+            # Register engine with front-end.
+            addresses = self.startup_handshake(handshake_socket, on_head_node,
+                                               vllm_config.parallel_config)
+
+            # Update config which may have changed from the handshake
+            vllm_config.__post_init__()
+
+            yield addresses
+
+            # Send ready message.
+            num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks
+            handshake_socket.send(
+                msgspec.msgpack.encode({
+                    "status": "READY",
+                    "local": on_head_node,
+                    "num_gpu_blocks": num_gpu_blocks,
+                }))
+
     @staticmethod
     def startup_handshake(
             handshake_socket: zmq.Socket, on_head_node: bool,
@@ -743,24 +749,29 @@ class DPEngineCoreProc(EngineCoreProc):
         executor_class: type[Executor],
         log_stats: bool,
     ):
-        # Add process-specific prefix to stdout and stderr before
-        # we initialize the engine.
-        from multiprocessing import current_process
-        process_name = current_process().name
-        pid = os.getpid()
-        _add_prefix(sys.stdout, process_name, pid)
-        _add_prefix(sys.stderr, process_name, pid)
+
+        self._decorate_logs()
 
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
         self.counter = 0
         self.current_wave = 0
+        self.last_counts = (0, 0)
 
         # Initialize the engine.
         dp_rank = vllm_config.parallel_config.data_parallel_rank
         super().__init__(vllm_config, on_head_node, handshake_address,
                          executor_class, log_stats, dp_rank)
 
+    def _decorate_logs(self):
+        # Add process-specific prefix to stdout and stderr before
+        # we initialize the engine.
+        from multiprocessing import current_process
+        process_name = current_process().name
+        pid = os.getpid()
+        _add_prefix(sys.stdout, process_name, pid)
+        _add_prefix(sys.stderr, process_name, pid)
+
     def _init_data_parallel(self, vllm_config: VllmConfig):
 
         # Configure GPUs and stateless process group for data parallel.
@@ -880,3 +891,70 @@ class DPEngineCoreProc(EngineCoreProc):
 
         return ParallelConfig.has_unfinished_dp(self.dp_group,
                                                 local_unfinished)
+
+
+class DPEngineCoreActor(DPEngineCoreProc):
+    """
+    Ray actor for running EngineCore in a data parallel context
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        on_head_node: bool,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        dp_rank: int = 0,
+        local_dp_rank: int = 0,
+    ):
+        self.addresses = addresses
+        vllm_config.parallel_config.data_parallel_rank = dp_rank
+        vllm_config.parallel_config.data_parallel_rank_local = \
+            local_dp_rank
+
+        # Ray sets CUDA_VISIBLE_DEVICES to empty string,
+        # we clean this up to be able to properly initialize
+        # data parallel groups.
+        del os.environ['CUDA_VISIBLE_DEVICES']
+
+        super().__init__(vllm_config, on_head_node, "", executor_class,
+                         log_stats)
+
+    def _decorate_logs(self):
+        pass
+
+    @contextmanager
+    def _perform_handshake(self, handshake_address: str, identity: bytes,
+                           on_head_node: bool, vllm_config: VllmConfig):
+        """
+        For Ray, we don't need to actually perform handshake.
+        All addresses information is known before the actor creation.
+        Therefore, we simply yield these addresses.
+        """
+        yield self.addresses
+
+    def wait_for_init(self):
+        """
+        Wait until the engine core is initialized.
+
+        This is just an empty method. When ray.get() on this method
+        (or any other method of the actor) returns, it is guaranteed
+        that actor creation (i.e., __init__) is complete.
+        """
+        pass
+
+    def run(self):
+        """
+        Run the engine core busy loop.
+        """
+        try:
+            self.run_busy_loop()
+        except SystemExit:
+            logger.debug("EngineCore exiting.")
+            raise
+        except Exception:
+            logger.exception("EngineCore encountered a fatal error.")
+            raise
+        finally:
+            self.shutdown()
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 232d6742b..fa01998aa 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -29,9 +29,9 @@ from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
-from vllm.v1.utils import (CoreEngine, CoreEngineProcManager,
-                           EngineZmqAddresses, get_engine_client_zmq_addr,
-                           wait_for_engine_startup)
+from vllm.v1.utils import (CoreEngine, CoreEngineActorManager,
+                           CoreEngineProcManager, EngineZmqAddresses,
+                           get_engine_client_zmq_addr, wait_for_engine_startup)
 
 logger = init_logger(__name__)
 
@@ -68,6 +68,8 @@ class EngineCoreClient(ABC):
 
         if multiprocess_mode and asyncio_mode:
             if vllm_config.parallel_config.data_parallel_size > 1:
+                if vllm_config.parallel_config.data_parallel_backend == "ray":
+                    return RayDPClient(vllm_config, executor_class, log_stats)
                 return DPAsyncMPClient(vllm_config, executor_class, log_stats)
 
             return AsyncMPClient(vllm_config, executor_class, log_stats)
@@ -273,7 +275,10 @@ class BackgroundResources:
     circular reference back to the client object."""
 
     ctx: Union[zmq.Context]
-    local_engine_manager: Optional[CoreEngineProcManager] = None
+    # If CoreEngineProcManager, it manages local engines;
+    # if CoreEngineActorManager, it manages all engines.
+    engine_manager: Optional[Union[CoreEngineProcManager,
+                                   CoreEngineActorManager]] = None
     coordinator: Optional[DPCoordinator] = None
     output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
     input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
@@ -290,8 +295,8 @@ class BackgroundResources:
         """Clean up background resources."""
 
         self.engine_dead = True
-        if self.local_engine_manager is not None:
-            self.local_engine_manager.close()
+        if self.engine_manager is not None:
+            self.engine_manager.close()
         if self.coordinator is not None:
             self.coordinator.close()
 
@@ -457,7 +462,7 @@ class MPClient(EngineCoreClient):
             if local_engine_count:
                 # In server mode, start_index and local_start_index will
                 # both be 0.
-                self.resources.local_engine_manager = CoreEngineProcManager(
+                self.resources.engine_manager = CoreEngineProcManager(
                     EngineCoreProc.run_engine_core,
                     vllm_config=vllm_config,
                     executor_class=executor_class,
@@ -484,13 +489,18 @@ class MPClient(EngineCoreClient):
             addresses.coordinator_input, addresses.coordinator_output = (
                 coordinator.get_engine_socket_addresses())
 
+        proc_manager = self.resources.engine_manager
+        assert isinstance(proc_manager, (type(None), CoreEngineProcManager)), (
+            "_wait_for_engine_startup should only be "
+            "called with CoreEngineProcManager")
+
         wait_for_engine_startup(
             handshake_socket,
             addresses,
             self.core_engines,
             self.vllm_config.parallel_config,
             self.vllm_config.cache_config,
-            self.resources.local_engine_manager,
+            proc_manager,
             coordinator.proc if coordinator else None,
         )
 
@@ -887,7 +897,6 @@ class DPAsyncMPClient(AsyncMPClient):
                  log_stats: bool,
                  client_addresses: Optional[dict[str, str]] = None,
                  client_index: int = 0):
-
         self.current_wave = 0
         self.engines_running = False
         # To route aborts to the correct engine.
@@ -1050,3 +1059,50 @@ class DPAsyncMPClient(AsyncMPClient):
         if not self.resources.engine_dead:
             await self._send_input(EngineCoreRequestType.ABORT, request_ids,
                                    engine)
+
+
+class RayDPClient(DPAsyncMPClient):
+    """
+    Ray-based client for multi-proc, multi-engine (data parallel)
+    EngineCore.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
+    ):
+        super().__init__(vllm_config, executor_class, log_stats,
+                         client_addresses, client_index)
+
+    def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool,
+                             local_start_index: int, input_address: str,
+                             output_address: str,
+                             executor_class: type[Executor], log_stats: bool):
+        """Self-contained client mode, launch engine and coordinator process
+        as needed."""
+
+        parallel_config = vllm_config.parallel_config
+        assert parallel_config.data_parallel_rank == 0
+        assert local_start_index == 0
+
+        addresses = EngineZmqAddresses(
+            inputs=[input_address],
+            outputs=[output_address],
+        )
+
+        if len(self.core_engines) > 1:
+            coordinator = DPCoordinator(parallel_config)
+            self.resources.coordinator = coordinator
+            addresses.coordinator_input, addresses.coordinator_output = (
+                coordinator.get_engine_socket_addresses())
+
+        # Start all engines.
+        self.resources.engine_manager = CoreEngineActorManager(
+            vllm_config=vllm_config,
+            addresses=addresses,
+            executor_class=executor_class,
+            log_stats=log_stats)
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index a26794561..d347efc42 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -27,6 +27,8 @@ from vllm.utils import (get_mp_context, get_open_port, get_open_zmq_ipc_path,
 from vllm.v1.executor.abstract import Executor
 
 if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
     from vllm.attention.layer import Attention
     from vllm.v1.engine.coordinator import DPCoordinator
 
@@ -112,6 +114,45 @@ def get_engine_client_zmq_addr(local_only: bool,
         host, port or get_open_port()))
 
 
+class CoreEngineState(Enum):
+    NEW = auto()
+    CONNECTED = auto()
+    READY = auto()
+
+
+class CoreEngine:
+    """One per data parallel rank."""
+
+    def __init__(self, index: int = 0, local: bool = True):
+        self.local = local
+        self.index = index
+        self.identity = index.to_bytes(2, "little")
+
+        self.state = CoreEngineState.NEW
+
+
+@dataclass
+class EngineZmqAddresses:
+    # ZMQ input socket addresses for each front-end client (requests)
+    inputs: list[str]
+    # ZMQ output socket addresses for each front-end client (responses)
+    outputs: list[str]
+    # ZMQ input socket address of DP coordinator if applicable
+    coordinator_input: Optional[str] = None
+    # ZMQ output socket address of DP coordinator if applicable
+    coordinator_output: Optional[str] = None
+
+
+@dataclass
+class EngineHandshakeMetadata:
+    """Metadata sent to each engine process during startup handshake,
+    including addresses of the front-end ZMQ queues that they should
+    connect to.
+    """
+    addresses: EngineZmqAddresses
+    parallel_config: dict[str, Union[int, str]]
+
+
 class APIServerProcessManager:
     """Manages a group of API server processes.
     
@@ -245,43 +286,168 @@ class CoreEngineProcManager:
         }
 
 
-class CoreEngineState(Enum):
-    NEW = auto()
-    CONNECTED = auto()
-    READY = auto()
-
+class CoreEngineActorManager:
+    """
+    Utility class to handle creation, readiness, and shutdown
+    of core engine Ray actors used by the AsyncLLM and LLMEngine.
 
-class CoreEngine:
-    """One per data parallel rank."""
+    Different from CoreEngineProcManager, this class manages
+    core engines for both local and remote nodes.
+    """
 
-    def __init__(self, index: int = 0, local: bool = True):
-        self.local = local
-        self.index = index
-        self.identity = index.to_bytes(2, "little")
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        placement_groups: Optional[list["PlacementGroup"]] = None,
+        local_dp_ranks: Optional[list[int]] = None,
+    ):
+        import copy
 
-        self.state = CoreEngineState.NEW
+        import ray
+        from ray.util.scheduling_strategies import (
+            PlacementGroupSchedulingStrategy)
 
+        from vllm.v1.engine.core import DPEngineCoreActor
 
-@dataclass
-class EngineZmqAddresses:
-    # ZMQ input socket addresses for each front-end client (requests)
-    inputs: list[str]
-    # ZMQ output socket addresses for each front-end client (responses)
-    outputs: list[str]
-    # ZMQ input socket address of DP coordinator if applicable
-    coordinator_input: Optional[str] = None
-    # ZMQ output socket address of DP coordinator if applicable
-    coordinator_output: Optional[str] = None
+        self.local_engine_actors: list[ray.ActorHandle] = []
+        self.remote_engine_actors: list[ray.ActorHandle] = []
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        local_engine_count = \
+            vllm_config.parallel_config.data_parallel_size_local
+        world_size = vllm_config.parallel_config.world_size
 
+        if ray.is_initialized():
+            logger.info(
+                "Ray is already initialized. Skipping Ray initialization.")
+        else:
+            ray.init()
+
+        if placement_groups is not None:
+            assert local_dp_ranks is not None, (
+                "local_dp_ranks must be provided if "
+                "placement_groups is provided")
+            assert len(placement_groups) == len(local_dp_ranks), (
+                "placement_groups and local_dp_ranks must "
+                "have the same length")
+            logger.info("Using provided placement groups")
+            # TODO(rui): validate passed-in placement groups
+            self.created_placement_groups = []
+        else:
+            placement_groups, local_dp_ranks = \
+                CoreEngineActorManager.create_dp_placement_groups(vllm_config)
+            self.created_placement_groups = placement_groups
+        assert len(placement_groups) == dp_size, (
+            "Number of placement groups must match data parallel size")
+
+        refs = []
+        for index in range(dp_size):
+            local_index = local_dp_ranks[index]
+            dp_vllm_config = copy.deepcopy(vllm_config)
+            pg = placement_groups[index]
+            dp_vllm_config.parallel_config.placement_group = pg
+            on_head_node = index < local_engine_count
+            actor = ray.remote(DPEngineCoreActor).options(
+                scheduling_strategy=PlacementGroupSchedulingStrategy(
+                    placement_group=pg,
+                    placement_group_bundle_index=world_size,
+                )).remote(vllm_config=dp_vllm_config,
+                          executor_class=executor_class,
+                          log_stats=log_stats,
+                          on_head_node=on_head_node,
+                          addresses=addresses,
+                          dp_rank=index,
+                          local_dp_rank=local_index)
+            if on_head_node:
+                self.local_engine_actors.append(actor)
+            else:
+                self.remote_engine_actors.append(actor)
+            refs.append(actor.wait_for_init.remote())
+
+        ray.get(refs)
+        self.run_refs = []
+        for actor in self.local_engine_actors + self.remote_engine_actors:
+            self.run_refs.append(actor.run.remote())
+
+    @staticmethod
+    def create_dp_placement_groups(
+            vllm_config: VllmConfig
+    ) -> tuple[list["PlacementGroup"], list[int]]:
+
+        import ray
+        from ray._private.state import available_resources_per_node
+        from ray.util.state import list_nodes
+
+        logger.info("Creating placement groups for data parallel")
+        dp_master_ip = \
+            vllm_config.parallel_config.data_parallel_master_ip
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        local_engine_count = \
+            vllm_config.parallel_config.data_parallel_size_local
+
+        nodes = list_nodes()
+        nodes = sorted(list_nodes(),
+                       key=lambda node: node.node_ip != dp_master_ip)
+        assert nodes[0].node_ip == dp_master_ip, (
+            "The first node must be the head node")
+        assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
+            "There can only be one head node")
+
+        available_resources = available_resources_per_node()
+        world_size = vllm_config.parallel_config.world_size
+        placement_groups: list[PlacementGroup] = []
+        local_dp_ranks: list[int] = []
+
+        for node in nodes:
+            node_ip = node.node_ip
+            node_resources = available_resources[node.node_id]
+            # For now, each DP rank can only be assigned to one node
+            # TODO(rui): support allocating a single DP rank
+            # to multiple nodes
+            available_engine_count = node_resources["GPU"] // world_size
+            if node_ip == dp_master_ip:
+                assert available_engine_count >= local_engine_count, (
+                    "Not enough resources to allocate DP ranks "
+                    f"on DP master node {node_ip}")
+                for i in range(local_engine_count):
+                    bundles = [{
+                        "GPU": 1.0,
+                        "node:" + dp_master_ip: 0.001
+                    }] * world_size + [{
+                        "CPU": 1.0
+                    }]
+                    pg = ray.util.placement_group(
+                        name=f"dp_rank_{len(placement_groups)}",
+                        strategy="STRICT_PACK",
+                        bundles=bundles,
+                    )
+                    placement_groups.append(pg)
+                    local_dp_ranks.append(i)
+            else:
+                for i in range(available_engine_count):
+                    if len(placement_groups) == dp_size:
+                        break
+                    bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}]
+                    pg = ray.util.placement_group(
+                        name=f"dp_rank_{len(placement_groups)}",
+                        strategy="STRICT_PACK",
+                        bundles=bundles,
+                    )
+                    placement_groups.append(pg)
+                    local_dp_ranks.append(i)
+        return placement_groups, local_dp_ranks
+
+    def get_run_refs(self):
+        return self.run_refs
 
-@dataclass
-class EngineHandshakeMetadata:
-    """Metadata sent to each engine process during startup handshake,
-    including addresses of the front-end ZMQ queues that they should
-    connect to.
-    """
-    addresses: EngineZmqAddresses
-    parallel_config: dict[str, Union[int, str]]
+    def close(self):
+        import ray
+        for actor in self.local_engine_actors + self.remote_engine_actors:
+            ray.kill(actor)
+        for pg in self.created_placement_groups:
+            ray.util.remove_placement_group(pg)
 
 
 def wait_for_engine_startup(
@@ -383,11 +549,19 @@ def wait_for_engine_startup(
 
 def wait_for_completion_or_failure(
         api_server_manager: APIServerProcessManager,
-        local_engine_manager: Optional[CoreEngineProcManager] = None,
+        engine_manager: Optional[Union[CoreEngineProcManager,
+                                       CoreEngineActorManager]] = None,
         coordinator: Optional["DPCoordinator"] = None) -> None:
     """Wait for all processes to complete or detect if any fail.
     
     Raises an exception if any process exits with a non-zero status.
+
+    Args:
+        api_server_manager: The manager for API servers.
+        engine_manager: The manager for engine processes.
+            If CoreEngineProcManager, it manages local engines;
+            if CoreEngineActorManager, it manages all engines.
+        coordinator: The coordinator for data parallel.
     """
 
     try:
@@ -402,14 +576,18 @@ def wait_for_completion_or_failure(
         if coordinator:
             sentinel_to_proc[coordinator.proc.sentinel] = coordinator.proc
 
-        if local_engine_manager:
-            for proc in local_engine_manager.processes:
+        actor_run_refs = []
+        if isinstance(engine_manager, CoreEngineProcManager):
+            for proc in engine_manager.processes:
                 sentinel_to_proc[proc.sentinel] = proc
+        elif isinstance(engine_manager, CoreEngineActorManager):
+            actor_run_refs = engine_manager.get_run_refs()
 
         # Check if any process terminates
-        while sentinel_to_proc:
+        while sentinel_to_proc or actor_run_refs:
             # Wait for any process to terminate
-            ready_sentinels: list[Any] = connection.wait(sentinel_to_proc)
+            ready_sentinels: list[Any] = connection.wait(sentinel_to_proc,
+                                                         timeout=5)
 
             # Process any terminated processes
             for sentinel in ready_sentinels:
@@ -420,6 +598,11 @@ def wait_for_completion_or_failure(
                     raise RuntimeError(
                         f"Process {proc.name} (PID: {proc.pid}) "
                         f"died with exit code {proc.exitcode}")
+
+            if actor_run_refs:
+                import ray
+                _, actor_run_refs = ray.wait(actor_run_refs, timeout=5)
+
     except KeyboardInterrupt:
         logger.info("Received KeyboardInterrupt, shutting down API servers...")
     except Exception as e:
@@ -431,8 +614,8 @@ def wait_for_completion_or_failure(
         api_server_manager.close()
         if coordinator:
             coordinator.close()
-        if local_engine_manager:
-            local_engine_manager.close()
+        if engine_manager:
+            engine_manager.close()
 
 
 # Note(rob): shutdown function cannot be a bound method,
-- 
GitLab


From 1282bd812ea4e1511378bad5b918d609280d2b89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= <wangzhipeng628@gmail.com>
Date: Tue, 3 Jun 2025 13:13:13 +0800
Subject: [PATCH 124/274] Add tarsier model support (#18985)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
---
 docs/models/supported_models.md               |   1 +
 examples/offline_inference/vision_language.py |  20 +
 .../vision_language_multi_image.py            |  21 +
 .../multimodal/processing/test_common.py      |   1 +
 tests/models/registry.py                      |   2 +
 vllm/model_executor/models/registry.py        |   1 +
 vllm/model_executor/models/tarsier.py         | 643 ++++++++++++++++++
 7 files changed, 689 insertions(+)
 create mode 100644 vllm/model_executor/models/tarsier.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index b60fefdda..f2090fe39 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -550,6 +550,7 @@ Specified using `--task generate`.
 | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni                                                             | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>                   | `Qwen/Qwen2.5-Omni-7B`                                                                                                                                  |                      | ✅︎                         | ✅︎\*                     |
 | `SkyworkR1VChatModel`                        | Skywork-R1V-38B                                                          | T + I                                                                 | `Skywork/Skywork-R1V-38B`                                                                                                                               |                       | ✅︎                          | ✅︎                      |
 | `SmolVLMForConditionalGeneration`            | SmolVLM2                                                                 | T + I                                                                 | `SmolVLM2-2.2B-Instruct`                                                                                                                                | ✅︎                     |                           | ✅︎                      |
+| `TarsierForConditionalGeneration`                | Tarsier                                                                  | T + I<sup>E+</sup>                                                                                                     | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b`                                                                                                           |                      | ✅︎                      | ✅︎                   |
 
 <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.  
 &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:  
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index f05045016..2ef87f4f4 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -333,6 +333,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# omni-research/Tarsier-7b
+def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "omni-research/Tarsier-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL3-2B"
@@ -1091,6 +1110,7 @@ model_example_map = {
     "qwen2_5_omni": run_qwen2_5_omni,
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,
+    "tarsier": run_tarsier,
 }
 
 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index e776ff7fe..7ce28c5a4 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -691,6 +691,26 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "omni-research/Tarsier-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    prompt = f"USER: {'<image>' * len(image_urls)}\n{question}\n ASSISTANT:"
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 model_example_map = {
     "aria": load_aria,
     "aya_vision": load_aya_vision,
@@ -712,6 +732,7 @@ model_example_map = {
     "qwen2_vl": load_qwen2_vl,
     "qwen2_5_vl": load_qwen2_5_vl,
     "smolvlm": load_smolvlm,
+    "tarsier": load_tarsier,
 }
 
 
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index d7f950c23..2377fef82 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -282,6 +282,7 @@ def _test_processing_correctness_one(
     "Skywork/Skywork-R1V-38B",
     "fixie-ai/ultravox-v0_5-llama-3_2-1b",
     "openai/whisper-large-v3",
+    "omni-research/Tarsier-7b",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
diff --git a/tests/models/registry.py b/tests/models/registry.py
index fe49d2427..182a9668e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -406,6 +406,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                      trust_remote_code=True),
+    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
+                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
     # [Encoder-decoder]
     # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
     # Therefore, we borrow the BartTokenizer from the original Bart model
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 8efd4825b..fcef457a7 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -211,6 +211,7 @@ _MULTIMODAL_MODELS = {
     "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
+    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
     # [Encoder-decoder]
     "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
new file mode 100644
index 000000000..5aa3ddabc
--- /dev/null
+++ b/vllm/model_executor/models/tarsier.py
@@ -0,0 +1,643 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
+                    Union, cast)
+
+import torch
+import torch.nn as nn
+from transformers import BatchFeature, CLIPVisionConfig
+from transformers import LlavaConfig as HfLlavaConfig
+from transformers import PretrainedConfig, SiglipVisionConfig
+from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
+from transformers.models.llava import LlavaProcessor
+from transformers.processing_utils import (ProcessingKwargs, Unpack,
+                                           _validate_images_text_input_order)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+from vllm.config import VllmConfig
+from vllm.inputs import InputProcessingContext
+from vllm.jsontree import json_map_leaves
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+
+from .clip import CLIPVisionModel
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .siglip import SiglipVisionModel
+from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
+                    maybe_prefix, merge_multimodal_embeddings)
+from .vision import VisionEncoderInfo, get_vision_encoder_info
+
+
+class TarsierImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    pixel_values: torch.Tensor
+
+
+class TarsierImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: torch.Tensor
+
+
+TarsierImageInputs = Union[TarsierImagePixelInputs,
+                           TarsierImageEmbeddingInputs]
+
+
+class TarsierHfConfig(Protocol):  # Based on the Tarsier's LlavaConfig
+    vision_config: Final[PretrainedConfig]
+    text_config: Final[PretrainedConfig]  # Added from Tarsier's LlavaConfig
+    image_token_index: Final[int]
+    vision_feature_select_strategy: Final[str]
+    vision_feature_layer: Final[Union[int, list[int]]]
+    projector_hidden_act: Final[str]
+    image_newline_idx: Final[int]
+    image_new_idx: Final[int]
+    multimodal_projector_bias: bool = True
+
+
+class TarsierProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {},
+    }
+
+
+class TarsierProcessor(LlavaProcessor):
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput],
+                    list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[TarsierProcessorKwargs],
+    ) -> BatchFeature:
+        if images is None and text is None:
+            raise ValueError(
+                "You have to specify at least one of `images` or `text`.")
+
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+
+        output_kwargs = self._merge_kwargs(
+            TarsierProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(
+                images, **output_kwargs["images_kwargs"])
+        else:
+            image_inputs = {}
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string,"
+                             " or a list of strings")
+
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = text
+        if image_inputs.get("pixel_values") is not None:
+            # Replace the image token with the expanded image token sequence
+            pixel_values = image_inputs["pixel_values"]
+            height, width = get_image_size(to_numpy_array(pixel_values[0]))
+            num_image_tokens = (height // self.patch_size) * (
+                width // self.patch_size +
+                1) + self.num_additional_image_tokens + 1
+            if self.vision_feature_select_strategy == "default":
+                num_image_tokens -= 1
+
+            prompt_strings = []
+            for sample in text:
+                sample = sample.replace(self.image_token,
+                                        self.image_token * num_image_tokens)
+                prompt_strings.append(sample)
+
+        return_tensors = output_kwargs["text_kwargs"].pop(
+            "return_tensors", None)
+        text_inputs = self.tokenizer(prompt_strings,
+                                     **output_kwargs["text_kwargs"])
+        return BatchFeature(data={
+            **text_inputs,
+            **image_inputs
+        },
+                            tensor_type=return_tensors)
+
+
+class TarsierMultiModalProjector(nn.Module):
+
+    def __init__(self,
+                 vision_hidden_size: int,
+                 text_hidden_size: int,
+                 projector_hidden_act: str,
+                 multimodal_projector_bias: bool,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__()
+
+        self.linear_1 = ColumnParallelLinear(vision_hidden_size,
+                                             text_hidden_size,
+                                             bias=multimodal_projector_bias,
+                                             quant_config=quant_config,
+                                             prefix=f"{prefix}.linear_1")
+        self.act = get_act_fn(projector_hidden_act)
+        self.linear_2 = RowParallelLinear(text_hidden_size,
+                                          text_hidden_size,
+                                          bias=multimodal_projector_bias,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.linear_2")
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class TarsierProcessingInfo(BaseProcessingInfo):
+
+    def get_hf_config(self) -> TarsierHfConfig:
+        return self.ctx.get_hf_config(HfLlavaConfig)
+
+    def get_vision_encoder_info(self) -> VisionEncoderInfo:
+        return get_vision_encoder_info(self.get_hf_config())
+
+    def get_hf_processor(self, **kwargs: object) -> TarsierProcessor:
+        hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs)
+        # Patch for patch_size if needed (copied from vLLM LLaVA)
+        if hasattr(hf_processor,
+                   'patch_size') and hf_processor.patch_size is None:
+            patch_size = self.get_vision_encoder_info().get_patch_size()
+            hf_processor.patch_size = patch_size
+        return hf_processor
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def _apply_feature_select_strategy(
+        self,
+        strategy: str,
+        encoder_num_image_tokens: int,
+    ) -> int:
+        if strategy == "default":
+            return encoder_num_image_tokens - 1
+        if strategy == "full":
+            return encoder_num_image_tokens
+        msg = f"Unexpected feature select strategy: {strategy!r}"
+        raise NotImplementedError(msg)
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_encoder_info = self.get_vision_encoder_info()
+        num_projected_patches = self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
+        )
+        if num_projected_patches <= 0:
+            default_size = self.get_image_size_with_most_features()
+            num_projected_patches_default = self._apply_feature_select_strategy(
+                hf_config.vision_feature_select_strategy,
+                vision_encoder_info.get_num_image_tokens(
+                    image_width=default_size.width,
+                    image_height=default_size.height,
+                ),
+            )
+            if num_projected_patches_default <= 0:
+                raise ValueError(
+                    "Could not determine a valid number of image patches.")
+            num_projected_patches = num_projected_patches_default
+        num_height_patches = int(math.sqrt(num_projected_patches))
+        total_image_tokens_for_llm = num_projected_patches \
+            + num_height_patches + 1
+        return total_image_tokens_for_llm
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
+
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+        )
+
+    def get_image_newline_idx(self) -> int:
+        return self.get_hf_config().image_newline_idx
+
+    def get_image_new_idx(self) -> int:
+        return self.get_hf_config().image_new_idx
+
+
+_I_Tarsier = TypeVar("_I_Tarsier", bound=TarsierProcessingInfo)
+
+
+class TarsierDummyInputsBuilder(LlavaDummyInputsBuilder[_I_Tarsier]):
+
+    pass
+
+
+class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_config = self.info.get_hf_config()
+        image_token_id = hf_config.image_token_index  # The <IMAGE> token ID
+
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_projected_patches = images.get_feature_size(item_idx)
+                # This assumes num_projected_patches is a perfect square
+                num_height_patches = int(math.sqrt(num_projected_patches))
+                num_final_image_tokens = num_projected_patches \
+                + num_height_patches + 1
+            else:
+                image_size = images.get_image_size(item_idx)
+                num_final_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+
+            return [image_token_id] * num_final_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],  # Replace each single <IMAGE> token
+                replacement=get_replacement,
+            ),
+        ]
+
+
+def _build_tarsier_hf_info(
+        ctx: InputProcessingContext) -> TarsierProcessingInfo:
+    return TarsierProcessingInfo(ctx)
+
+
+def _build_tarsier_hf_processor(
+    info: _I_Tarsier,
+    dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
+    *,
+    cache: Optional[ProcessingCache] = None,
+) -> BaseMultiModalProcessor:
+    if isinstance(info, TarsierProcessingInfo):
+        return TarsierMultiModalProcessor(
+            info,
+            dummy_inputs,
+            cache=cache,
+        )
+    raise NotImplementedError(type(info))
+
+
+def init_vision_tower_for_tarsier(
+    hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
+    quant_config: Optional[QuantizationConfig],
+    *,
+    require_post_norm: Optional[bool] = None,
+    prefix: str = "",
+) -> Union[CLIPVisionModel, SiglipVisionModel]:
+    vision_config = hf_config.vision_config
+
+    feature_layers = hf_config.vision_feature_layer
+    base_num_hidden_layers = vision_config.num_hidden_layers
+
+    def _get_layer_index(feature_layer_index: int,
+                         num_hidden_layers_total: int) -> int:
+        if feature_layer_index < 0:
+            return num_hidden_layers_total + feature_layer_index + 1
+        return feature_layer_index
+
+    if isinstance(feature_layers, int):
+        num_hidden_layers_to_init = _get_layer_index(feature_layers,
+                                                     base_num_hidden_layers)
+    elif isinstance(feature_layers, (list, tuple)):
+        num_hidden_layers_to_init = max(
+            _get_layer_index(idx, base_num_hidden_layers)
+            for idx in feature_layers)
+    else:
+        raise TypeError(f"vision_layer_feature type: {type(feature_layers)}"
+                        " is not supported")
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return CLIPVisionModel(
+            vision_config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_to_init,
+            require_post_norm=require_post_norm,
+            prefix=prefix,
+        )
+    elif isinstance(vision_config, SiglipVisionConfig):
+        return SiglipVisionModel(
+            vision_config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_to_init,
+            require_post_norm=require_post_norm,
+            prefix=prefix,
+        )
+
+    msg = f"Unsupported vision config for Tarsier: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+@MULTIMODAL_REGISTRY.register_processor(_build_tarsier_hf_processor,
+                                        info=_build_tarsier_hf_info,
+                                        dummy_inputs=TarsierDummyInputsBuilder)
+class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                      SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        config: TarsierHfConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config  # Storing the Tarsier-specific HF config
+        self.vision_tower = init_vision_tower_for_tarsier(
+            config,
+            quant_config,
+            require_post_norm=False,
+            prefix=maybe_prefix(prefix, "vision_tower"))
+        projector_bias = getattr(config, "multimodal_projector_bias", True)
+
+        self.multi_modal_projector = TarsierMultiModalProjector(
+            vision_hidden_size=config.vision_config.hidden_size,
+            text_hidden_size=config.text_config.hidden_size,
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=projector_bias,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "multi_modal_projector"))
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.
+            text_config,  # Use text_config from Tarsier's main config
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+        self.register_buffer('image_newline_idx_tensor',
+                             torch.tensor([config.image_newline_idx],
+                                          dtype=torch.long),
+                             persistent=False)
+        self.register_buffer('image_new_idx_tensor',
+                             torch.tensor([config.image_new_idx],
+                                          dtype=torch.long),
+                             persistent=False)
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)  # Assuming 3 channels
+        actual_dims = tuple(data.shape[1:])
+
+        if actual_dims != expected_dims:
+            expected_expr = ("batch_size", *map(str, expected_dims))
+            raise ValueError(
+                f"The expected shape of pixel values is {expected_expr}. "
+                f"You supplied {tuple(data.shape)}.")
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[TarsierImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            return TarsierImagePixelInputs(
+                type="pixel_values",
+                pixel_values=self._validate_pixel_values(
+                    flatten_bn(pixel_values, concat=True)),
+            )
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+            return TarsierImageEmbeddingInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds, concat=True),
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _select_image_features(self, image_features: torch.Tensor, *,
+                               strategy: str) -> torch.Tensor:
+        if strategy == "default":
+            return image_features[:, 1:]
+        elif strategy == "full":
+            return image_features
+        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+
+    def _image_pixels_to_features(
+        self,
+        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        # From vLLM LLaVA, vision tower output handling
+        image_hidden_states = vision_tower(pixel_values)
+        if not isinstance(image_hidden_states, torch.Tensor):
+            raise TypeError(
+                f"image_hidden_states type: {type(image_hidden_states)}"
+                " is not supported")
+
+        def select_features_fn(leaf: torch.Tensor):
+            return self._select_image_features(
+                leaf,
+                strategy=self.config.vision_feature_select_strategy,
+            )
+
+        selected_features = cast(
+            Union[torch.Tensor, tuple[torch.Tensor, ...]],
+            json_map_leaves(select_features_fn, image_hidden_states),
+        )
+        return selected_features
+
+    def _add_tarsier_split_tokens(
+            self, projected_image_features: torch.Tensor) -> torch.Tensor:
+        """
+        Implements Tarsier's `add_split_tokens` logic.
+        """
+        num_images, num_projected_patches, embed_dim = \
+            projected_image_features.shape
+        num_height_patches = int(math.sqrt(num_projected_patches))
+        num_width_patches = num_projected_patches // num_height_patches
+        device = projected_image_features.device
+        embedding_layer = self.language_model.model.embed_tokens
+        image_newline_emb = embedding_layer(
+            self.image_newline_idx_tensor.to(device)).squeeze(0)
+        image_new_emb = embedding_layer(
+            self.image_new_idx_tensor.to(device)).squeeze(0)
+        try:
+            current_image_features_grid = projected_image_features.view(
+                num_images, num_height_patches, num_width_patches, embed_dim)
+        except RuntimeError as e:
+            raise RuntimeError(
+                "Cannot reshape projected_image_features"
+                f" with shape {projected_image_features.shape} "
+                f"to ({num_images}, {num_height_patches},"
+                f" {num_width_patches}, {embed_dim}). "
+                "Ensure num_projected_patches is compatible"
+                " with a grid structure. "
+                f"num_projected_patches={num_projected_patches}, "
+                f"derived num_height_patches={num_height_patches}. ") from e
+
+        image_newline_expanded = image_newline_emb.expand(
+            (num_images, num_height_patches, 1, embed_dim))
+        features_with_newlines = torch.cat(
+            [current_image_features_grid, image_newline_expanded],
+            dim=2  # Concatenate along width dim
+        )
+        new_num_patches_after_newline = num_projected_patches \
+            + num_height_patches
+        features_with_newlines_flat = features_with_newlines.view(
+            num_images, new_num_patches_after_newline, embed_dim)
+        image_new_expanded = image_new_emb.expand((num_images, 1, embed_dim))
+        final_image_features = torch.cat(
+            [features_with_newlines_flat, image_new_expanded],
+            dim=1  # Concatenate along patch sequence dim
+        )
+        return final_image_features
+
+    def _process_image_pixels(
+        self,
+        inputs: TarsierImagePixelInputs,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        assert self.vision_tower is not None
+        pixel_values = inputs["pixel_values"]
+        image_features_selected = self._image_pixels_to_features(
+            self.vision_tower, pixel_values)  # type: ignore
+        if isinstance(image_features_selected, torch.Tensor):
+            projected_features = self.multi_modal_projector(
+                image_features_selected)
+            final_features = self._add_tarsier_split_tokens(projected_features)
+            return final_features
+        else:
+            raise TypeError(
+                f"_image_pixels_to_features type:"
+                f" {type(image_features_selected)} is not supported")
+
+    def _process_image_input(
+        self,
+        image_input: TarsierImageInputs,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        if image_input["type"] == "image_embeds":
+            projected_features = image_input["data"]
+            if isinstance(projected_features, torch.Tensor):
+                return self._add_tarsier_split_tokens(projected_features)
+            else:
+                raise ValueError("Incorrect type of image_embeds. "
+                                 f"Got type: {type(projected_features)}. ")
+        assert self.vision_tower is not None
+        return self._process_image_pixels(image_input)
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def get_multimodal_embeddings(
+            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+        return self._process_image_input(image_input)
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                self.config.image_token_index,
+            )
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+        hidden_states = self.language_model.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
-- 
GitLab


From 17430e36531aeade52518b13961706d4227310f9 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Tue, 3 Jun 2025 13:35:12 +0800
Subject: [PATCH 125/274] [bugfix] small fix logic issue (#18999)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index a5b155024..e3b8a18cc 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -455,7 +455,7 @@ class EngineArgs:
             title="ModelConfig",
             description=ModelConfig.__doc__,
         )
-        if 'serve' not in sys.argv[1:] and '--help' not in sys.argv[1:]:
+        if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
             model_group.add_argument("--model", **model_kwargs["model"])
         model_group.add_argument("--task", **model_kwargs["task"])
         model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
-- 
GitLab


From cc977286e7a4350183aeef873858fe0dc6774740 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 3 Jun 2025 02:00:45 -0400
Subject: [PATCH 126/274] Reduce logs in CLI scripts and plugin loader (#18970)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 vllm/benchmarks/datasets.py   |  6 +++---
 vllm/benchmarks/latency.py    |  2 --
 vllm/benchmarks/throughput.py |  1 -
 vllm/compilation/backends.py  |  6 +++---
 vllm/plugins/__init__.py      | 19 +++++++++++++------
 5 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 35cc303f6..21fe3eb62 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -328,9 +328,9 @@ class RandomDataset(BenchmarkDataset):
         output_high = int(output_len * (1 + range_ratio))
 
         # Add logging for debugging
-        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
-        logger.info("Sampling output_len from [%s, %s]", output_low,
-                    output_high)
+        logger.info(
+            "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
+            input_low, input_high, output_low, output_high)
 
         input_lens = np.random.randint(input_low,
                                        input_high + 1,
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index c9e03cc3b..dc1c42879 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -78,7 +78,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
 
 
 def main(args: argparse.Namespace):
-    print(args)
     if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
         raise OSError(
             "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
@@ -101,7 +100,6 @@ def main(args: argparse.Namespace):
         max_tokens=args.output_len,
         detokenize=not args.disable_detokenize,
     )
-    print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 13110a8b4..3ea6c194b 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -527,7 +527,6 @@ def main(args: argparse.Namespace):
     validate_args(args)
     if args.seed is None:
         args.seed = 0
-    print(args)
     random.seed(args.seed)
     # Sample the requests.
     tokenizer = AutoTokenizer.from_pretrained(
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index b724479a9..c4bfffe92 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -31,13 +31,13 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
     if compilation_config.use_inductor:
         if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer(
                 "2.8.0"):
-            logger.info("Using InductorStandaloneAdaptor")
+            logger.debug("Using InductorStandaloneAdaptor")
             return InductorStandaloneAdaptor()
         else:
-            logger.info("Using InductorAdaptor")
+            logger.debug("Using InductorAdaptor")
             return InductorAdaptor()
     else:
-        logger.info("Using EagerAdaptor")
+        logger.debug("Using EagerAdaptor")
         return EagerAdaptor()
 
 
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 2884cb46f..4cd3552f8 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -10,6 +10,8 @@ import vllm.envs as envs
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_PLUGINS_GROUP = 'vllm.general_plugins'
+
 # make sure one process only loads plugins once
 plugins_loaded = False
 
@@ -28,19 +30,24 @@ def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
         logger.debug("No plugins for group %s found.", group)
         return {}
 
-    logger.info("Available plugins for group %s:", group)
+    # Check if the only discovered plugin is the default one
+    is_default_group = (group == DEFAULT_PLUGINS_GROUP)
+    # Use INFO for non-default groups and DEBUG for the default group
+    log_level = logger.debug if is_default_group else logger.info
+
+    log_level("Available plugins for group %s:", group)
     for plugin in discovered_plugins:
-        logger.info("- %s -> %s", plugin.name, plugin.value)
+        log_level("- %s -> %s", plugin.name, plugin.value)
 
     if allowed_plugins is None:
-        logger.info("All plugins in this group will be loaded. "
-                    "Set `VLLM_PLUGINS` to control which plugins to load.")
+        log_level("All plugins in this group will be loaded. "
+                  "Set `VLLM_PLUGINS` to control which plugins to load.")
 
     plugins = dict[str, Callable[[], Any]]()
     for plugin in discovered_plugins:
         if allowed_plugins is None or plugin.name in allowed_plugins:
             if allowed_plugins is not None:
-                logger.info("Loading plugin %s", plugin.name)
+                log_level("Loading plugin %s", plugin.name)
 
             try:
                 func = plugin.load()
@@ -80,7 +87,7 @@ def load_general_plugins():
             # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
             os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
 
-    plugins = load_plugins_by_group(group='vllm.general_plugins')
+    plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP)
     # general plugins, we only need to execute the loaded functions
     for func in plugins.values():
         func()
-- 
GitLab


From d32aa2e67002afe936b8d2cadffd8adc7aaf48e7 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Tue, 3 Jun 2025 15:16:17 +0800
Subject: [PATCH 127/274] [Bugfix] Use cmake 3.26.1 instead of 3.26 to avoid
 build failure (#19019)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 docker/Dockerfile.neuron                           | 2 +-
 docs/getting_started/installation/cpu/build.inc.md | 2 +-
 pyproject.toml                                     | 2 +-
 requirements/build.txt                             | 2 +-
 requirements/rocm-build.txt                        | 2 +-
 requirements/tpu.txt                               | 2 +-
 requirements/xpu.txt                               | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron
index 259dc5a23..8bc235547 100644
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
 RUN python3 -m pip install -U \
-        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         -r requirements/neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
index 7d6472afa..7ddadccb1 100644
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@@ -17,7 +17,7 @@ Third, install Python packages for vLLM CPU backend building:
 
 ```console
 pip install --upgrade pip
-pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
+pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy
 pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 ```
 
diff --git a/pyproject.toml b/pyproject.toml
index 10f5dbeae..307878f7e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [build-system]
 # Should be mirrored in requirements/build.txt
 requires = [
-    "cmake>=3.26",
+    "cmake>=3.26.1",
     "ninja",
     "packaging>=24.2",
     "setuptools>=77.0.3,<80.0.0",
diff --git a/requirements/build.txt b/requirements/build.txt
index 320e5b892..528cd3b53 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -1,5 +1,5 @@
 # Should be mirrored in pyproject.toml
-cmake>=3.26
+cmake>=3.26.1
 ninja
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index 981b90632..94201543c 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -7,7 +7,7 @@ torchvision==0.22.0
 torchaudio==2.7.0
 
 triton==3.2
-cmake>=3.26,<4
+cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index edc8b2a45..47e638463 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -2,7 +2,7 @@
 -r common.txt
 
 # Dependencies for TPU
-cmake>=3.26
+cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
 wheel
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index 04c4d4ff8..3cb6a4a8a 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -2,7 +2,7 @@
 -r common.txt
 
 ray>=2.9
-cmake>=3.26
+cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
 setuptools>=77.0.3,<80.0.0
-- 
GitLab


From f32fcd944430603ebcbbf04454b4e15754168ef4 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Tue, 3 Jun 2025 16:01:48 +0800
Subject: [PATCH 128/274] [v1][KVCacheManager] Rename BlockHashType to
 BlockHash (#19015)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 docs/design/v1/prefix_caching.md             |  2 +-
 tests/v1/core/test_kv_cache_utils.py         | 12 +++++-------
 tests/v1/core/test_prefix_caching.py         |  4 ++--
 tests/v1/core/test_specialized_manager.py    |  4 ++--
 vllm/v1/core/block_pool.py                   |  8 ++++----
 vllm/v1/core/kv_cache_manager.py             |  4 ++--
 vllm/v1/core/kv_cache_utils.py               | 14 +++++++-------
 vllm/v1/core/single_type_kv_cache_manager.py | 10 +++++-----
 8 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/docs/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md
index ad041b005..bbdfb2552 100644
--- a/docs/design/v1/prefix_caching.md
+++ b/docs/design/v1/prefix_caching.md
@@ -104,7 +104,7 @@ class KVCacheBlock:
     block_id: int
     # The block hash (will be assigned when the block is full,
     # and will be reset when the block is evicted).
-    block_hash: BlockHashType
+    block_hash: BlockHash
     # The number of requests using this block now.
     ref_cnt: int
 
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index d3d62cf09..61aee8752 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -100,8 +100,8 @@ def test_kv_cache_block():
     assert block.ref_cnt == 0
 
     # Test block hash setting and resetting
-    block_hash = vllm.v1.core.kv_cache_utils.BlockHashType(hash_value=123,
-                                                           token_ids=(1, 2, 3))
+    block_hash = vllm.v1.core.kv_cache_utils.BlockHash(hash_value=123,
+                                                       token_ids=(1, 2, 3))
     block.block_hash = block_hash
     assert block.block_hash == block_hash
 
@@ -282,7 +282,7 @@ def test_hash_block_tokens(hash_fn):
 
     block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                    curr_block_token_ids, extra_keys)
-    assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHashType)
+    assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHash)
     assert block_hash.hash_value == hash_fn(
         (parent_block_hash, curr_block_token_ids, extra_keys))
     assert block_hash.token_ids == curr_block_token_ids
@@ -306,10 +306,8 @@ def test_hash_request_tokens(hash_fn):
     block_hashes = hash_request_tokens(hash_fn, block_size, request)
 
     assert len(block_hashes) == 2
-    assert isinstance(block_hashes[0],
-                      vllm.v1.core.kv_cache_utils.BlockHashType)
-    assert isinstance(block_hashes[1],
-                      vllm.v1.core.kv_cache_utils.BlockHashType)
+    assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash)
+    assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash)
 
     # Check the first block
     assert block_hashes[0].token_ids == (0, 1, 2)
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index ba3c0b3cf..1a7a31d98 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -12,7 +12,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.utils import sha256
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
-from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
+from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
                                          hash_block_tokens)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, SlidingWindowSpec)
@@ -547,7 +547,7 @@ def test_cache_blocks(hash_fn):
 
     # Test that blocks are cached correctly for 2 full blocks from the start.
     blocks = [KVCacheBlock(block_id=i) for i in range(2)]
-    block_hashes: list[BlockHashType] = []
+    block_hashes: list[BlockHash] = []
 
     block_pool.cache_full_blocks(
         request=req,
diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py
index 101a2379b..4217dc37e 100644
--- a/tests/v1/core/test_specialized_manager.py
+++ b/tests/v1/core/test_specialized_manager.py
@@ -3,7 +3,7 @@
 import torch
 
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
+from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.core.single_type_kv_cache_manager import SlidingWindowManager
 from vllm.v1.kv_cache_interface import SlidingWindowSpec
 
@@ -32,7 +32,7 @@ def test_sliding_window_possible_cached_prefix():
 
     def run_one_case(block_is_cached, expect_length):
         block_hash_list = [
-            BlockHashType(i, ()) for i in range(len(block_is_cached))
+            BlockHash(i, ()) for i in range(len(block_is_cached))
         ]
 
         block_pool.cached_block_hash_to_block.clear()
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index f2ed183b6..a0a065df9 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -6,7 +6,7 @@ from typing import Callable, Optional
 from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved,
                                         BlockStored, KVCacheEvent)
 from vllm.logger import init_logger
-from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
+from vllm.v1.core.kv_cache_utils import (BlockHash, FreeKVCacheBlockQueue,
                                          KVCacheBlock,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens)
@@ -55,7 +55,7 @@ class BlockPool:
         # if there is already an identical block in the cache. This is because
         # we want to make sure the allocated block IDs won't change so that
         # block tables are append-only.
-        self.cached_block_hash_to_block: dict[BlockHashType, dict[
+        self.cached_block_hash_to_block: dict[BlockHash, dict[
             int, KVCacheBlock]] = defaultdict(dict)
 
         # To represent a placeholder block with block_id=0.
@@ -67,7 +67,7 @@ class BlockPool:
         self.kv_event_queue: list[KVCacheEvent] = []
 
     def get_cached_block(self,
-                         block_hash: BlockHashType) -> Optional[KVCacheBlock]:
+                         block_hash: BlockHash) -> Optional[KVCacheBlock]:
         """Get a cached block by the block hash, or None if cache miss.
         If there are duplicated blocks, we return the first block in the cache.
 
@@ -87,7 +87,7 @@ class BlockPool:
         self,
         request: Request,
         blocks: list[KVCacheBlock],
-        block_hashes: list[BlockHashType],
+        block_hashes: list[BlockHash],
         num_cached_blocks: int,
         num_full_blocks: int,
         block_size: int,
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 0f6098d2b..59e07382b 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -8,7 +8,7 @@ from vllm.distributed.kv_events import KVCacheEvent
 from vllm.logger import init_logger
 from vllm.utils import sha256
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
+from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
                                          hash_request_tokens)
 from vllm.v1.core.single_type_kv_cache_manager import (
     get_manager_for_kv_cache_spec)
@@ -92,7 +92,7 @@ class KVCacheManager:
         # This is to avoid recomputing the block hashes for each call of
         # `get_computed_blocks` or `allocate_slots`.
         self.req_to_block_hashes: defaultdict[
-            str, list[BlockHashType]] = defaultdict(list)
+            str, list[BlockHash]] = defaultdict(list)
 
     @property
     def usage(self) -> float:
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index a41fe4881..3ccad97e9 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -18,7 +18,7 @@ from vllm.v1.request import Request
 logger = init_logger(__name__)
 
 
-class BlockHashType(NamedTuple):
+class BlockHash(NamedTuple):
     """Hash value of a block (int), the token IDs in the block, and extra keys.
     We keep a tuple of token IDs and extra keys to reduce the likelihood of
     hash collisions when the hash value is the same. By using SHA256 however,
@@ -117,7 +117,7 @@ class KVCacheBlock:
     ref_cnt: int = 0
     # The hash of the block composed of (block hash, tuple of token IDs).
     # It is only available when the block is full.
-    _block_hash: Optional[BlockHashType] = None
+    _block_hash: Optional[BlockHash] = None
 
     # Used to construct a doubly linked list for free blocks.
     # These two attributes should only be manipulated by FreeKVCacheBlockQueue.
@@ -131,11 +131,11 @@ class KVCacheBlock:
         self.ref_cnt -= 1
 
     @property
-    def block_hash(self) -> Optional[BlockHashType]:
+    def block_hash(self) -> Optional[BlockHash]:
         return self._block_hash
 
     @block_hash.setter
-    def block_hash(self, block_hash: BlockHashType):
+    def block_hash(self, block_hash: BlockHash):
         assert self.block_hash is None, (
             "The block already has a hash. This should not happen.")
         self._block_hash = block_hash
@@ -398,7 +398,7 @@ def hash_block_tokens(
         hash_function: Callable,
         parent_block_hash: Optional[int],
         curr_block_token_ids: Sequence[int],
-        extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType:
+        extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash:
     """Computes a hash value corresponding to the contents of a block and
     the contents of the preceding block(s). The hash value is used for
     prefix caching. We use LRU cache for this function to avoid recomputing
@@ -419,14 +419,14 @@ def hash_block_tokens(
         parent_block_hash = NONE_HASH
 
     curr_block_token_ids_tuple = tuple(curr_block_token_ids)
-    return BlockHashType(
+    return BlockHash(
         hash_function(
             (parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
         curr_block_token_ids_tuple, extra_keys)
 
 
 def hash_request_tokens(hash_function: Any, block_size: int,
-                        request: Request) -> list[BlockHashType]:
+                        request: Request) -> list[BlockHash]:
     """Computes hash values of a chain of blocks given a sequence of
     token IDs. The hash value is used for prefix caching.
 
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 0223c9cee..e69e9ac9f 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -5,7 +5,7 @@ from typing import Callable
 
 from vllm.utils import cdiv
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
+from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
                                         SlidingWindowSpec)
 from vllm.v1.request import Request
@@ -133,7 +133,7 @@ class SingleTypeKVCacheManager(ABC):
             req_blocks.extend(new_blocks)
             return new_blocks
 
-    def cache_blocks(self, request: Request, block_hashes: list[BlockHashType],
+    def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
                      num_tokens: int) -> None:
         """
         Cache the blocks for the request.
@@ -187,7 +187,7 @@ class SingleTypeKVCacheManager(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def find_longest_cache_hit(self, block_hashes: list[BlockHashType],
+    def find_longest_cache_hit(self, block_hashes: list[BlockHash],
                                max_length: int) -> list[KVCacheBlock]:
         """
         Get the longest cache hit prefix of the blocks that is not longer than 
@@ -228,7 +228,7 @@ class SingleTypeKVCacheManager(ABC):
 
 class FullAttentionManager(SingleTypeKVCacheManager):
 
-    def find_longest_cache_hit(self, block_hashes: list[BlockHashType],
+    def find_longest_cache_hit(self, block_hashes: list[BlockHash],
                                max_length: int) -> list[KVCacheBlock]:
         computed_blocks: list[KVCacheBlock] = []
         max_num_blocks = max_length // self.block_size
@@ -280,7 +280,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
             self.sliding_window_contiguous_blocks += 1
         self._null_block = block_pool.null_block
 
-    def find_longest_cache_hit(self, block_hashes: list[BlockHashType],
+    def find_longest_cache_hit(self, block_hashes: list[BlockHash],
                                max_length: int) -> list[KVCacheBlock]:
         # TODO: reduce i by sliding_window_contiguous_blocks when cache miss, to
         # optimize the time complexity from O(max_num_blocks) to
-- 
GitLab


From 6d18ed2a2e858a8061dfe8c2e140c2c498d6a99a Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 3 Jun 2025 04:21:53 -0400
Subject: [PATCH 129/274] Update docker docs with ARM CUDA cross-compile
 (#19037)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 docs/deployment/docker.md | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md
index 516640f6f..9e506d3d7 100644
--- a/docs/deployment/docker.md
+++ b/docs/deployment/docker.md
@@ -107,10 +107,21 @@ DOCKER_BUILDKIT=1 docker build . \
   -t vllm/vllm-gh200-openai:latest \
   --build-arg max_jobs=66 \
   --build-arg nvcc_threads=2 \
-  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
   --build-arg vllm_fa_cmake_gpu_arches="90-real"
 ```
 
+!!! note
+    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
+
+    Run the following command on your host machine to register QEMU user static handlers:
+
+    ```console
+    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+    ```
+
+    After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
+
 ## Use the custom-built vLLM Docker image
 
 To run vLLM with the custom-built Docker image:
-- 
GitLab


From 42243fbda04d908aa16f17bf3d5f9cf35e4ef26f Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 3 Jun 2025 17:08:03 +0800
Subject: [PATCH 130/274] [Doc] Add InternVL LoRA support  (#19055)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 docs/models/supported_models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index f2090fe39..71414d2aa 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -524,7 +524,7 @@ Specified using `--task generate`.
 | `GraniteSpeechForConditionalGeneration`      | Granite Speech                                                           | T + A                                                                 | `ibm-granite/granite-speech-3.3-8b`                                                                                                                     | ✅︎                     | ✅︎                          | ✅︎                    |
 | `H2OVLChatModel`                             | H2OVL                                                                    | T + I<sup>E+</sup>                                                    | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.                                                                                      |                       | ✅︎                          | ✅︎\*                     |
 | `Idefics3ForConditionalGeneration`           | Idefics3                                                                 | T + I                                                                 | `HuggingFaceM4/Idefics3-8B-Llama3` etc.                                                                                                                 | ✅︎                     |                           |  ✅︎                     |
-| `InternVLChatModel`                          | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>)                                                    | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.  |                      | ✅︎                          | ✅︎                      |
+| `InternVLChatModel`                          | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>)                                                    | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.  | ✅︎               | ✅︎                          | ✅︎                      |
 | `KimiVLForConditionalGeneration`             | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking                               | T + I<sup>+</sup>                                                     | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`                                                                                    |                      |                             | ✅︎                       |
 | `Llama4ForConditionalGeneration`             | Llama 4                                                                  | T + I<sup>+</sup>                                                     | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. |                       | ✅︎                          | ✅︎                      |
 | `LlavaForConditionalGeneration`              | LLaVA-1.5                                                                | T + I<sup>E+</sup>                                                    | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.                                                                        |                       | ✅︎                          | ✅︎                      |
-- 
GitLab


From ec2dcd80bc173c06a4c48377d4a6b6ca2c78a2f5 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Tue, 3 Jun 2025 17:08:20 +0800
Subject: [PATCH 131/274] [Misc] Update `WeightsMapper` for qwen2-vl/qwen2.5-vl
 (#19054)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/qwen2_5_vl.py | 13 +++++++++----
 vllm/model_executor/models/qwen2_vl.py   | 13 +++++++++----
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index e3fa9f67c..f62c7e1d2 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -823,10 +823,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsLoRA, SupportsPP):
 
     # To ensure correct weight loading and mapping.
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "lm_head.": "language_model.lm_head.",
-        "model.": "language_model.model.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        })
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 873baa56f..5c30e36c7 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1071,10 +1071,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                       SupportsLoRA, SupportsPP):
 
     # To ensure correct weight loading and mapping.
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "lm_head.": "language_model.lm_head.",
-        "model.": "language_model.model.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        })
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-- 
GitLab


From 118ff921118cc81061a2af865a1e13840ceb6792 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 3 Jun 2025 17:29:41 +0800
Subject: [PATCH 132/274] [Doc] Update V1 user guide for embedding and enc-dec
 models (#19060)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/usage/v1_guide.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index 3d5d7ce45..a2321bf98 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -1,5 +1,7 @@
 # vLLM V1
 
+**We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.**
+
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
 To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
@@ -51,9 +53,9 @@ This living user guide outlines a few known **important changes and limitations*
 | **Spec Decode**                             | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
 | **Prompt Logprobs with Prefix Caching**     | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
 | **Structured Output Alternative Backends**  | <nobr>🟡 Planned</nobr>                                                           |
-| **Embedding Models**                        | <nobr>🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249))</nobr> |
+| **Embedding Models**                        | <nobr>🚧 WIP ([PR #18015](https://github.com/vllm-project/vllm/pull/18015))</nobr> |
 | **Mamba Models**                            | <nobr>🟡 Planned</nobr>                                                           |
-| **Encoder-Decoder Models**                  | <nobr>🟡 Planned</nobr>                                                           |
+| **Encoder-Decoder Models**                  | <nobr>🟠 Delayed</nobr>                                                           |
 | **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr>                                                        |
 | **best_of**                                 | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
 | **Per-Request Logits Processors**           | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
@@ -63,10 +65,11 @@ This living user guide outlines a few known **important changes and limitations*
 - **🟢 Functional**: Fully operational, with ongoing optimizations.  
 - **🚧 WIP**: Under active development.  
 - **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).  
-- **🔴 Deprecated**: Not planned for v1 unless there is strong demand.
+- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
+- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
 
 **Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same
-way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically
+way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically
 allocate a fixed token budget per request, enabling features like chunked prefills,
 prefix caching, and speculative decoding without a strict separation between prefill
 and decode phases.
@@ -140,7 +143,9 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco
 and the majority fall into the following categories. V1 support for these models will be added eventually.
 
 **Embedding Models**  
-Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage.
+Initially, we will create a [separate model runner](https://github.com/vllm-project/vllm/pull/18015) to provide V1 support without conflicting with other ongoing work.
+
+Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. [PR #16188](https://github.com/vllm-project/vllm/pull/16188) is the first step towards enabling this.
 
 **Mamba Models**  
 Models using selective state-space mechanisms (instead of standard transformer attention)
-- 
GitLab


From 4e88723f32f1115130566b31dba0d3c31ab1b13f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 3 Jun 2025 21:42:17 +0800
Subject: [PATCH 133/274] [doc] clarify windows support (#19088)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/getting_started/installation/gpu.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index 3c983f600..f8a3acef7 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -19,6 +19,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 - OS: Linux
 - Python: 3.9 -- 3.12
 
+!!! note
+    vLLM does not support Windows natively. To run vLLM on Windows, you can use the Windows Subsystem for Linux (WSL) with a compatible Linux distribution, or use some community-maintained forks, e.g. [https://github.com/SystemPanic/vllm-windows](https://github.com/SystemPanic/vllm-windows).
+
 === "NVIDIA CUDA"
 
     --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
-- 
GitLab


From 4e68ae5e59b24fad3865eb34421b36bef4751888 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 3 Jun 2025 22:30:18 +0800
Subject: [PATCH 134/274] [CI/Build] Remove V0 LoRA test (#19066)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_add_lora.py       | 21 ++-----------------
 tests/lora/test_chatglm3_tp.py    | 10 ---------
 tests/lora/test_llama_tp.py       |  8 --------
 tests/lora/test_lora_functions.py | 34 ++++++++-----------------------
 tests/lora/test_mixtral.py        |  8 --------
 tests/lora/test_quant_model.py    |  8 --------
 tests/lora/test_qwen2vl.py        |  8 --------
 tests/lora/test_worker.py         | 10 ---------
 8 files changed, 10 insertions(+), 97 deletions(-)

diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index c8b7a5cbf..17347300b 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -6,6 +6,8 @@ import pytest
 
 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
@@ -16,14 +18,6 @@ LORA_RANK = 64
 DEFAULT_MAX_LORAS = 4 * 3
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
         LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
@@ -88,17 +82,6 @@ async def test_add_lora(chatglm3_lora_files):
         trust_remote_code=True,
         enforce_eager=True)
 
-    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
-    # environment variable. reload vllm.enging.async_llm_engine as
-    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
-    # env var.
-    import importlib
-
-    import vllm.engine.async_llm_engine
-    importlib.reload(vllm.engine.async_llm_engine)
-    from vllm.entrypoints.openai.api_server import (
-        build_async_engine_client_from_engine_args)
-
     # split lora_requests into 3 parts
     part_size = len(lora_requests) // 3
     dummy_run_requests = lora_requests[:part_size]
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 2c18a115b..cd9526c8b 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import pytest
-
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -18,14 +16,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 580992dea..54daea5b9 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -33,14 +33,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM,
               lora_path: str,
               lora_id: int,
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index 7ae33a848..fd80f61a5 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -2,26 +2,24 @@
 """
 Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 """
-
-import os
-
 import pytest
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.llm_engine import LLMEngine
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
 from vllm.lora.request import LoRARequest
 
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
 LORA_RANK = 8
 
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
+# @pytest.fixture(autouse=True)
+# def v1(run_with_both_engines_lora):
+#     # Simple autouse wrapper to run both engines for each test
+#     # This can be promoted up to conftest.py to run for every
+#     # test in a package
+#     pass
 
 
 def make_lora_request(lora_id: int):
@@ -79,22 +77,6 @@ def test_lora_functions_sync():
 @pytest.mark.asyncio
 async def test_lora_functions_async():
 
-    if os.getenv("VLLM_USE_V1") == "0":
-        pytest.skip(
-            reason=
-            "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions")
-
-    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
-    # environment variable. reload vllm.enging.async_llm_engine as
-    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
-    # env var.
-    import importlib
-
-    import vllm.engine.async_llm_engine
-    importlib.reload(vllm.engine.async_llm_engine)
-    from vllm.entrypoints.openai.api_server import (
-        build_async_engine_client_from_engine_args)
-
     max_loras = 4
     engine_args = AsyncEngineArgs(model=MODEL_PATH,
                                   enable_lora=True,
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index aea769193..4e77c5559 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -10,14 +10,6 @@ from vllm.platforms import current_platform
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
               prompts: list[str]) -> list[str]:
 
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index 7a76ffb74..43e2975cd 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -37,14 +37,6 @@ else:
     ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM,
               lora_path: str,
               lora_id: int,
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 162714df2..20a1ae67d 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -13,14 +13,6 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import BeamSearchParams
 
 
-@pytest.fixture(autouse=not current_platform.is_cpu())
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @dataclass
 class TestConfig:
     model_path: str
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index e5ae660af..1a5d52716 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,8 +6,6 @@ import tempfile
 from typing import Union
 from unittest.mock import patch
 
-import pytest
-
 import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
@@ -18,14 +16,6 @@ from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
 
-- 
GitLab


From 476844d44cbf315c6c1e8431946bdecfe9823834 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 3 Jun 2025 15:39:24 +0100
Subject: [PATCH 135/274] Fix underscores in dict keys passed via CLI (#19030)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/test_utils.py | 11 +++++++++++
 vllm/utils.py       | 13 ++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index dd8777f06..42e0df1ff 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -259,11 +259,18 @@ def test_dict_args(parser):
         "--model-name=something.something",
         "--hf-overrides.key1",
         "val1",
+        # Test nesting
         "--hf-overrides.key2.key3",
         "val2",
         "--hf-overrides.key2.key4",
         "val3",
+        # Test = sign
         "--hf-overrides.key5=val4",
+        # Test underscore to dash conversion
+        "--hf_overrides.key_6",
+        "val5",
+        "--hf_overrides.key-7.key_8",
+        "val6",
     ]
     parsed_args = parser.parse_args(args)
     assert parsed_args.model_name == "something.something"
@@ -274,6 +281,10 @@ def test_dict_args(parser):
             "key4": "val3",
         },
         "key5": "val4",
+        "key_6": "val5",
+        "key-7": {
+            "key_8": "val6",
+        },
     }
 
 
diff --git a/vllm/utils.py b/vllm/utils.py
index c879b38d0..b4152e6b2 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1456,17 +1456,24 @@ class FlexibleArgumentParser(ArgumentParser):
         if '--config' in args:
             args = self._pull_args_from_config(args)
 
+        def repl(match: re.Match) -> str:
+            """Replaces underscores with dashes in the matched string."""
+            return match.group(0).replace("_", "-")
+
+        # Everything between the first -- and the first .
+        pattern = re.compile(r"(?<=--)[^\.]*")
+
         # Convert underscores to dashes and vice versa in argument names
         processed_args = []
         for arg in args:
             if arg.startswith('--'):
                 if '=' in arg:
                     key, value = arg.split('=', 1)
-                    key = '--' + key[len('--'):].replace('_', '-')
+                    key = pattern.sub(repl, key, count=1)
                     processed_args.append(f'{key}={value}')
                 else:
-                    processed_args.append('--' +
-                                          arg[len('--'):].replace('_', '-'))
+                    key = pattern.sub(repl, arg, count=1)
+                    processed_args.append(key)
             elif arg.startswith('-O') and arg != '-O' and len(arg) == 2:
                 # allow -O flag to be used without space, e.g. -O3
                 processed_args.append('-O')
-- 
GitLab


From d81edded69a5534a80785b68cde26c547cfcd4c6 Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
Date: Tue, 3 Jun 2025 17:06:04 +0200
Subject: [PATCH 136/274] [Bugfix] disable processor cache  (#19068)

Signed-off-by: raushan <raushan@huggingface.co>
---
 vllm/v1/engine/mm_input_cache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index fcb90bebd..45fb5cd23 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -34,8 +34,8 @@ class MirroredProcessingCache:
 
     def __init__(self, model_config):
         mm_config = model_config.multimodal_config
-        disable_mm_preprocessor_cache = mm_config is not None and \
-            not mm_config.disable_mm_preprocessor_cache
+        disable_mm_preprocessor_cache = (
+            mm_config is not None and mm_config.disable_mm_preprocessor_cache)
         self.use_cache = not disable_mm_preprocessor_cache
         self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
                                                       MultiModalKwargs)
-- 
GitLab


From d00dd65cd4dbc1ebbdbe2cd070ff694e9e9321a2 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Tue, 3 Jun 2025 23:44:34 +0800
Subject: [PATCH 137/274] [Doc] Improve the Pull Request template with key
 components (#19086)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 .github/PULL_REQUEST_TEMPLATE.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 65be771b9..c1d1e07bf 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,6 +1,15 @@
-FILL IN THE PR DESCRIPTION HERE
+## Essential Elements of an Effective PR Description Checklist
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
 
-FIX #xxxx (*link existing issues this PR will resolve*)
+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
+
+## Purpose
+
+## Test Plan
+
+## Test Result
 
 <!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
-- 
GitLab


From 4b7817c119e27ad9b1e1930a34006eff9680a457 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Tue, 3 Jun 2025 18:15:16 +0200
Subject: [PATCH 138/274] [Misc] Add missing `_Backend` enums (#19081)

Signed-off-by: nicklucche <nlucches@redhat.com>
---
 vllm/platforms/interface.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 5c4f7a2f7..c7a627262 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -47,6 +47,8 @@ class _Backend(enum.Enum):
     TORCH_SDPA = enum.auto()
     FLASHINFER = enum.auto()
     TRITON_MLA = enum.auto()  # Supported by V1
+    TRITON_MLA_VLLM_V1 = enum.auto()
+    FLASHMLA_VLLM_V1 = enum.auto()
     FLASHMLA = enum.auto()  # Supported by V1
     HPU_ATTN = enum.auto()
     PALLAS = enum.auto()
-- 
GitLab


From d054da1992175787f936d18aead51bef663a0399 Mon Sep 17 00:00:00 2001
From: CYJiang <86391540+googs1025@users.noreply.github.com>
Date: Wed, 4 Jun 2025 02:02:07 +0800
Subject: [PATCH 139/274] [Misc] fix: add miss best_of param validation
 (#18555)

Signed-off-by: googs1025 <googs1025@gmail.com>
---
 vllm/sampling_params.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index dc38daa38..4294465f6 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -389,6 +389,17 @@ class SamplingParams(
                              f"type {type(self.n)}")
         if self.n < 1:
             raise ValueError(f"n must be at least 1, got {self.n}.")
+        if self.best_of is not None:
+            if not isinstance(self.best_of, int):
+                raise ValueError(
+                    f"best_of must be an integer, got {type(self.best_of)}")
+            if self.best_of < 1:
+                raise ValueError(
+                    f"best_of must be at least 1, got {self.best_of}")
+            if self.best_of < self.n:
+                raise ValueError(
+                    f"best_of must be greater than or equal to n, "
+                    f"got n={self.n} and best_of={self.best_of}.")
         if not -2.0 <= self.presence_penalty <= 2.0:
             raise ValueError("presence_penalty must be in [-2, 2], got "
                              f"{self.presence_penalty}.")
-- 
GitLab


From 02f0c7b220422792f5e53de2a7d51d2d3ff2df28 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Tue, 3 Jun 2025 11:20:17 -0700
Subject: [PATCH 140/274] [Misc] Add SPDX-FileCopyrightText  (#19100)

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 .buildkite/check-wheel-size.py                              | 1 +
 .buildkite/generate_index.py                                | 1 +
 .buildkite/lm-eval-harness/conftest.py                      | 1 +
 .buildkite/lm-eval-harness/test_lm_eval_correctness.py      | 1 +
 .../scripts/convert-results-json-to-markdown.py             | 1 +
 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py | 1 +
 .../nightly-benchmarks/scripts/generate-nightly-markdown.py | 1 +
 .../nightly-benchmarks/scripts/get-lmdeploy-modelname.py    | 1 +
 .../nightly-benchmarks/scripts/summary-nightly-results.py   | 1 +
 benchmarks/backend_request_func.py                          | 1 +
 benchmarks/benchmark_dataset.py                             | 1 +
 benchmarks/benchmark_latency.py                             | 1 +
 benchmarks/benchmark_long_document_qa_throughput.py         | 1 +
 benchmarks/benchmark_prefix_caching.py                      | 1 +
 benchmarks/benchmark_prioritization.py                      | 1 +
 benchmarks/benchmark_serving.py                             | 1 +
 benchmarks/benchmark_serving_structured_output.py           | 1 +
 benchmarks/benchmark_throughput.py                          | 1 +
 benchmarks/benchmark_utils.py                               | 1 +
 benchmarks/cutlass_benchmarks/sparse_benchmarks.py          | 1 +
 benchmarks/cutlass_benchmarks/utils.py                      | 1 +
 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py            | 1 +
 benchmarks/cutlass_benchmarks/weight_shapes.py              | 1 +
 benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py | 1 +
 benchmarks/disagg_benchmarks/round_robin_proxy.py           | 1 +
 benchmarks/disagg_benchmarks/visualize_benchmark_results.py | 1 +
 benchmarks/fused_kernels/layernorm_rms_benchmarks.py        | 1 +
 benchmarks/kernels/bench_fp8_gemm.py                        | 1 +
 benchmarks/kernels/benchmark_aqlm.py                        | 1 +
 benchmarks/kernels/benchmark_bitblas.py                     | 1 +
 benchmarks/kernels/benchmark_cutlass_fp4_moe.py             | 1 +
 benchmarks/kernels/benchmark_grouped_gemm_cutlass.py        | 1 +
 benchmarks/kernels/benchmark_layernorm.py                   | 1 +
 benchmarks/kernels/benchmark_lora.py                        | 1 +
 benchmarks/kernels/benchmark_machete.py                     | 1 +
 benchmarks/kernels/benchmark_marlin.py                      | 1 +
 benchmarks/kernels/benchmark_moe.py                         | 1 +
 benchmarks/kernels/benchmark_moe_permute_unpermute.py       | 1 +
 benchmarks/kernels/benchmark_paged_attention.py             | 1 +
 benchmarks/kernels/benchmark_quant.py                       | 1 +
 benchmarks/kernels/benchmark_rmsnorm.py                     | 1 +
 benchmarks/kernels/benchmark_rope.py                        | 1 +
 benchmarks/kernels/benchmark_shapes.py                      | 1 +
 benchmarks/kernels/benchmark_w8a8_block_fp8.py              | 1 +
 .../kernels/deepgemm/benchmark_fp8_block_dense_gemm.py      | 1 +
 benchmarks/kernels/graph_machete_bench.py                   | 1 +
 benchmarks/kernels/utils.py                                 | 1 +
 benchmarks/kernels/weight_shapes.py                         | 1 +
 benchmarks/overheads/benchmark_hashing.py                   | 1 +
 cmake/hipify.py                                             | 1 +
 csrc/cutlass_extensions/vllm_cutlass_library_extension.py   | 1 +
 csrc/moe/marlin_moe_wna16/generate_kernels.py               | 1 +
 csrc/quantization/gptq_marlin/generate_kernels.py           | 1 +
 csrc/quantization/machete/generate.py                       | 1 +
 docs/mkdocs/hooks/generate_examples.py                      | 1 +
 docs/mkdocs/hooks/remove_announcement.py                    | 1 +
 docs/mkdocs/hooks/url_schemes.py                            | 1 +
 examples/offline_inference/audio_language.py                | 1 +
 examples/offline_inference/automatic_prefix_caching.py      | 1 +
 examples/offline_inference/basic/basic.py                   | 1 +
 examples/offline_inference/basic/chat.py                    | 1 +
 examples/offline_inference/basic/classify.py                | 1 +
 examples/offline_inference/basic/embed.py                   | 1 +
 examples/offline_inference/basic/generate.py                | 1 +
 examples/offline_inference/basic/score.py                   | 1 +
 examples/offline_inference/batch_llm_inference.py           | 1 +
 examples/offline_inference/chat_with_tools.py               | 1 +
 examples/offline_inference/context_extension.py             | 1 +
 examples/offline_inference/data_parallel.py                 | 1 +
 .../disaggregated-prefill-v1/decode_example.py              | 1 +
 .../disaggregated-prefill-v1/prefill_example.py             | 1 +
 examples/offline_inference/disaggregated_prefill.py         | 1 +
 examples/offline_inference/eagle.py                         | 1 +
 examples/offline_inference/embed_jina_embeddings_v3.py      | 1 +
 examples/offline_inference/embed_matryoshka_fy.py           | 1 +
 examples/offline_inference/encoder_decoder.py               | 1 +
 examples/offline_inference/encoder_decoder_multimodal.py    | 1 +
 examples/offline_inference/llm_engine_example.py            | 1 +
 examples/offline_inference/load_sharded_state.py            | 1 +
 .../offline_inference/lora_with_quantization_inference.py   | 1 +
 examples/offline_inference/metrics.py                       | 1 +
 examples/offline_inference/mistral-small.py                 | 1 +
 examples/offline_inference/mlpspeculator.py                 | 1 +
 examples/offline_inference/multilora_inference.py           | 1 +
 examples/offline_inference/neuron.py                        | 1 +
 examples/offline_inference/neuron_eagle.py                  | 1 +
 examples/offline_inference/neuron_int8_quantization.py      | 1 +
 examples/offline_inference/neuron_multimodal.py             | 1 +
 examples/offline_inference/neuron_speculation.py            | 1 +
 examples/offline_inference/prefix_caching.py                | 1 +
 examples/offline_inference/prithvi_geospatial_mae.py        | 1 +
 examples/offline_inference/profiling.py                     | 1 +
 examples/offline_inference/profiling_tpu/profiling.py       | 1 +
 examples/offline_inference/prompt_embed_inference.py        | 1 +
 examples/offline_inference/qwen2_5_omni/only_thinker.py     | 1 +
 examples/offline_inference/qwen_1m.py                       | 1 +
 examples/offline_inference/reproducibility.py               | 1 +
 examples/offline_inference/rlhf.py                          | 1 +
 examples/offline_inference/rlhf_colocate.py                 | 1 +
 examples/offline_inference/rlhf_utils.py                    | 1 +
 examples/offline_inference/save_sharded_state.py            | 1 +
 examples/offline_inference/simple_profiling.py              | 1 +
 examples/offline_inference/structured_outputs.py            | 1 +
 examples/offline_inference/torchrun_example.py              | 1 +
 examples/offline_inference/tpu.py                           | 1 +
 examples/offline_inference/vision_language.py               | 1 +
 examples/offline_inference/vision_language_embedding.py     | 1 +
 examples/offline_inference/vision_language_multi_image.py   | 1 +
 examples/online_serving/api_client.py                       | 1 +
 examples/online_serving/cohere_rerank_client.py             | 1 +
 .../disaggregated_serving/disagg_proxy_demo.py              | 1 +
 examples/online_serving/gradio_openai_chatbot_webserver.py  | 1 +
 examples/online_serving/gradio_webserver.py                 | 1 +
 examples/online_serving/jinaai_rerank_client.py             | 1 +
 examples/online_serving/kv_events_subscriber.py             | 1 +
 examples/online_serving/openai_chat_completion_client.py    | 1 +
 .../openai_chat_completion_client_for_multimodal.py         | 1 +
 .../openai_chat_completion_client_with_tools.py             | 1 +
 .../openai_chat_completion_client_with_tools_required.py    | 1 +
 .../openai_chat_completion_structured_outputs.py            | 1 +
 ...nai_chat_completion_structured_outputs_structural_tag.py | 1 +
 ...nai_chat_completion_structured_outputs_with_reasoning.py | 1 +
 .../openai_chat_completion_tool_calls_with_reasoning.py     | 1 +
 .../online_serving/openai_chat_completion_with_reasoning.py | 1 +
 .../openai_chat_completion_with_reasoning_streaming.py      | 1 +
 .../openai_chat_embedding_client_for_multimodal.py          | 1 +
 examples/online_serving/openai_classification_client.py     | 1 +
 examples/online_serving/openai_completion_client.py         | 1 +
 examples/online_serving/openai_cross_encoder_score.py       | 1 +
 examples/online_serving/openai_embedding_client.py          | 1 +
 examples/online_serving/openai_embedding_matryoshka_fy.py   | 1 +
 examples/online_serving/openai_pooling_client.py            | 1 +
 examples/online_serving/openai_transcription_client.py      | 1 +
 examples/online_serving/opentelemetry/dummy_client.py       | 1 +
 .../prompt_embed_inference_with_openai_client.py            | 1 +
 examples/online_serving/ray_serve_deepseek.py               | 1 +
 .../retrieval_augmented_generation_with_langchain.py        | 1 +
 .../retrieval_augmented_generation_with_llamaindex.py       | 1 +
 .../online_serving/streamlit_openai_chatbot_webserver.py    | 1 +
 examples/online_serving/utils.py                            | 1 +
 examples/others/lmcache/cpu_offload_lmcache.py              | 1 +
 examples/others/lmcache/disagg_prefill_lmcache_v0.py        | 1 +
 .../disagg_prefill_lmcache_v1/disagg_proxy_server.py        | 1 +
 examples/others/lmcache/kv_cache_sharing_lmcache_v1.py      | 1 +
 examples/others/tensorize_vllm_model.py                     | 1 +
 find_cuda_init.py                                           | 1 +
 setup.py                                                    | 1 +
 tests/async_engine/api_server_async_engine.py               | 1 +
 tests/async_engine/conftest.py                              | 1 +
 tests/async_engine/test_api_server.py                       | 1 +
 tests/async_engine/test_async_llm_engine.py                 | 1 +
 tests/async_engine/test_request_tracker.py                  | 1 +
 tests/basic_correctness/test_basic_correctness.py           | 1 +
 tests/basic_correctness/test_chunked_prefill.py             | 1 +
 tests/basic_correctness/test_cpu_offload.py                 | 1 +
 tests/basic_correctness/test_cumem.py                       | 1 +
 tests/basic_correctness/test_preemption.py                  | 1 +
 tests/benchmarks/test_latency_cli.py                        | 1 +
 tests/benchmarks/test_serve_cli.py                          | 1 +
 tests/benchmarks/test_throughput_cli.py                     | 1 +
 tests/build_cython.py                                       | 1 +
 tests/compile/backend.py                                    | 1 +
 tests/compile/conftest.py                                   | 1 +
 tests/compile/piecewise/test_full_cudagraph.py              | 1 +
 tests/compile/piecewise/test_simple.py                      | 1 +
 tests/compile/piecewise/test_toy_llama.py                   | 1 +
 tests/compile/test_async_tp.py                              | 1 +
 tests/compile/test_basic_correctness.py                     | 1 +
 tests/compile/test_full_graph.py                            | 1 +
 tests/compile/test_functionalization.py                     | 1 +
 tests/compile/test_fusion.py                                | 1 +
 tests/compile/test_pass_manager.py                          | 1 +
 tests/compile/test_sequence_parallelism.py                  | 1 +
 tests/compile/test_silu_mul_quant_fusion.py                 | 1 +
 tests/compile/test_wrapper.py                               | 1 +
 tests/conftest.py                                           | 1 +
 tests/core/block/conftest.py                                | 1 +
 tests/core/block/e2e/conftest.py                            | 1 +
 tests/core/block/e2e/test_correctness.py                    | 1 +
 tests/core/block/e2e/test_correctness_sliding_window.py     | 1 +
 tests/core/block/test_block_manager.py                      | 1 +
 tests/core/block/test_block_table.py                        | 1 +
 tests/core/block/test_common.py                             | 1 +
 tests/core/block/test_cpu_gpu_block_allocator.py            | 1 +
 tests/core/block/test_naive_block.py                        | 1 +
 tests/core/block/test_prefix_caching_block.py               | 1 +
 tests/core/conftest.py                                      | 1 +
 tests/core/test_chunked_prefill_scheduler.py                | 1 +
 tests/core/test_num_computed_tokens_update.py               | 1 +
 tests/core/test_scheduler.py                                | 1 +
 tests/core/test_scheduler_encoder_decoder.py                | 1 +
 tests/core/test_serialization.py                            | 1 +
 tests/core/utils.py                                         | 1 +
 tests/detokenizer/conftest.py                               | 1 +
 tests/detokenizer/test_disable_detokenization.py            | 1 +
 tests/detokenizer/test_stop_checker.py                      | 1 +
 tests/detokenizer/test_stop_reason.py                       | 1 +
 tests/detokenizer/test_stop_strings.py                      | 1 +
 tests/distributed/conftest.py                               | 1 +
 tests/distributed/test_ca_buffer_sharing.py                 | 1 +
 tests/distributed/test_comm_ops.py                          | 1 +
 tests/distributed/test_custom_all_reduce.py                 | 1 +
 tests/distributed/test_distributed_oot.py                   | 1 +
 tests/distributed/test_events.py                            | 1 +
 tests/distributed/test_expert_parallel.py                   | 1 +
 tests/distributed/test_multi_node_assignment.py             | 1 +
 tests/distributed/test_pipeline_parallel.py                 | 1 +
 tests/distributed/test_pipeline_partition.py                | 1 +
 tests/distributed/test_pp_cudagraph.py                      | 1 +
 tests/distributed/test_pynccl.py                            | 1 +
 tests/distributed/test_same_node.py                         | 1 +
 tests/distributed/test_sequence_parallel.py                 | 1 +
 tests/distributed/test_shm_broadcast.py                     | 1 +
 tests/distributed/test_torchrun_example.py                  | 1 +
 tests/distributed/test_utils.py                             | 1 +
 tests/encoder_decoder/test_e2e_correctness.py               | 1 +
 tests/engine/conftest.py                                    | 1 +
 tests/engine/test_arg_utils.py                              | 1 +
 tests/engine/test_computed_prefix_blocks.py                 | 1 +
 tests/engine/test_executor.py                               | 1 +
 tests/engine/test_multi_step_output_processor.py            | 1 +
 tests/engine/test_multiproc_workers.py                      | 1 +
 tests/engine/test_options.py                                | 1 +
 tests/engine/test_short_mm_context.py                       | 1 +
 tests/entrypoints/conftest.py                               | 1 +
 tests/entrypoints/llm/test_accuracy.py                      | 1 +
 tests/entrypoints/llm/test_chat.py                          | 1 +
 tests/entrypoints/llm/test_collective_rpc.py                | 1 +
 tests/entrypoints/llm/test_encode.py                        | 1 +
 tests/entrypoints/llm/test_generate.py                      | 1 +
 tests/entrypoints/llm/test_generate_multiple_loras.py       | 1 +
 tests/entrypoints/llm/test_gpu_utilization.py               | 1 +
 tests/entrypoints/llm/test_guided_generate.py               | 1 +
 tests/entrypoints/llm/test_lazy_outlines.py                 | 1 +
 tests/entrypoints/llm/test_prompt_validation.py             | 1 +
 tests/entrypoints/offline_mode/test_offline_mode.py         | 1 +
 tests/entrypoints/openai/correctness/test_lmeval.py         | 1 +
 tests/entrypoints/openai/correctness/test_mteb.py           | 1 +
 .../correctness/test_transcription_api_correctness.py       | 1 +
 tests/entrypoints/openai/test_async_tokenization.py         | 1 +
 tests/entrypoints/openai/test_audio.py                      | 1 +
 tests/entrypoints/openai/test_basic.py                      | 1 +
 tests/entrypoints/openai/test_chat.py                       | 1 +
 tests/entrypoints/openai/test_chat_echo.py                  | 1 +
 tests/entrypoints/openai/test_chat_logit_bias_validation.py | 1 +
 tests/entrypoints/openai/test_chat_template.py              | 1 +
 tests/entrypoints/openai/test_chat_with_tool_reasoning.py   | 1 +
 tests/entrypoints/openai/test_chunked_prompt.py             | 1 +
 tests/entrypoints/openai/test_classification.py             | 1 +
 tests/entrypoints/openai/test_cli_args.py                   | 1 +
 tests/entrypoints/openai/test_completion.py                 | 1 +
 .../openai/test_completion_with_function_calling.py         | 1 +
 .../openai/test_completion_with_prompt_embeds.py            | 1 +
 tests/entrypoints/openai/test_embedding.py                  | 1 +
 tests/entrypoints/openai/test_embedding_dimensions.py       | 1 +
 tests/entrypoints/openai/test_encoder_decoder.py            | 1 +
 tests/entrypoints/openai/test_lora_adapters.py              | 1 +
 tests/entrypoints/openai/test_lora_resolvers.py             | 1 +
 tests/entrypoints/openai/test_metrics.py                    | 1 +
 tests/entrypoints/openai/test_models.py                     | 1 +
 tests/entrypoints/openai/test_oot_registration.py           | 1 +
 tests/entrypoints/openai/test_openai_schema.py              | 1 +
 tests/entrypoints/openai/test_pooling.py                    | 1 +
 tests/entrypoints/openai/test_prompt_validation.py          | 1 +
 tests/entrypoints/openai/test_rerank.py                     | 1 +
 tests/entrypoints/openai/test_return_tokens_as_ids.py       | 1 +
 tests/entrypoints/openai/test_root_path.py                  | 1 +
 tests/entrypoints/openai/test_run_batch.py                  | 1 +
 tests/entrypoints/openai/test_score.py                      | 1 +
 tests/entrypoints/openai/test_serving_chat.py               | 1 +
 tests/entrypoints/openai/test_serving_models.py             | 1 +
 tests/entrypoints/openai/test_shutdown.py                   | 1 +
 tests/entrypoints/openai/test_sleep.py                      | 1 +
 tests/entrypoints/openai/test_tensorizer_entrypoint.py      | 1 +
 tests/entrypoints/openai/test_tokenization.py               | 1 +
 tests/entrypoints/openai/test_transcription_validation.py   | 1 +
 tests/entrypoints/openai/test_truncation.py                 | 1 +
 tests/entrypoints/openai/test_video.py                      | 1 +
 tests/entrypoints/openai/test_vision.py                     | 1 +
 tests/entrypoints/openai/test_vision_embedding.py           | 1 +
 .../openai/tool_parsers/test_llama4_pythonic_tool_parser.py | 1 +
 .../openai/tool_parsers/test_pythonic_tool_parser.py        | 1 +
 tests/entrypoints/openai/tool_parsers/utils.py              | 1 +
 tests/entrypoints/test_api_server_process_manager.py        | 1 +
 tests/entrypoints/test_chat_utils.py                        | 1 +
 tests/entrypoints/test_ssl_cert_refresher.py                | 1 +
 tests/fastsafetensors_loader/test_fastsafetensors_loader.py | 1 +
 tests/fastsafetensors_loader/test_weight_utils.py           | 1 +
 tests/kernels/allclose_default.py                           | 1 +
 tests/kernels/attention/conftest.py                         | 1 +
 tests/kernels/attention/test_attention.py                   | 1 +
 tests/kernels/attention/test_attention_selector.py          | 1 +
 tests/kernels/attention/test_blocksparse_attention.py       | 1 +
 tests/kernels/attention/test_cache.py                       | 1 +
 tests/kernels/attention/test_cascade_flash_attn.py          | 1 +
 tests/kernels/attention/test_encoder_decoder_attn.py        | 1 +
 tests/kernels/attention/test_flash_attn.py                  | 1 +
 tests/kernels/attention/test_flashinfer.py                  | 1 +
 tests/kernels/attention/test_flashmla.py                    | 1 +
 tests/kernels/attention/test_lightning_attn.py              | 1 +
 tests/kernels/attention/test_merge_attn_states.py           | 1 +
 tests/kernels/attention/test_mha_attn.py                    | 1 +
 tests/kernels/attention/test_mla_decode_cpu.py              | 1 +
 tests/kernels/attention/test_prefix_prefill.py              | 1 +
 tests/kernels/attention/test_rocm_attention_selector.py     | 1 +
 tests/kernels/attention/test_triton_decode_attention.py     | 1 +
 tests/kernels/attention/test_triton_unified_attention.py    | 1 +
 tests/kernels/core/test_activation.py                       | 1 +
 tests/kernels/core/test_fused_quant_layernorm.py            | 1 +
 tests/kernels/core/test_layernorm.py                        | 1 +
 tests/kernels/core/test_opcheck.py                          | 1 +
 tests/kernels/core/test_permute_cols.py                     | 1 +
 tests/kernels/core/test_pos_encoding.py                     | 1 +
 tests/kernels/core/test_rotary_embedding.py                 | 1 +
 tests/kernels/core/test_uva.py                              | 1 +
 tests/kernels/mamba/test_causal_conv1d.py                   | 1 +
 tests/kernels/mamba/test_mamba_mixer2.py                    | 1 +
 tests/kernels/mamba/test_mamba_ssm.py                       | 1 +
 tests/kernels/mamba/test_mamba_ssm_ssd.py                   | 1 +
 tests/kernels/moe/test_batched_moe.py                       | 1 +
 tests/kernels/moe/test_cutlass_moe.py                       | 1 +
 tests/kernels/moe/test_moe.py                               | 1 +
 tests/kernels/moe/test_moe_permute_unpermute.py             | 1 +
 tests/kernels/moe/test_nvfp4_moe.py                         | 1 +
 tests/kernels/moe/test_pplx_moe.py                          | 1 +
 tests/kernels/moe/test_rocm_aiter_topk.py                   | 1 +
 tests/kernels/moe/test_triton_moe_ptpc_fp8.py               | 1 +
 tests/kernels/quant_utils.py                                | 1 +
 tests/kernels/quantization/nvfp4_utils.py                   | 1 +
 tests/kernels/quantization/test_allspark_gemm.py            | 1 +
 tests/kernels/quantization/test_aqlm.py                     | 1 +
 tests/kernels/quantization/test_awq.py                      | 1 +
 tests/kernels/quantization/test_awq_triton.py               | 1 +
 tests/kernels/quantization/test_block_fp8.py                | 1 +
 tests/kernels/quantization/test_block_int8.py               | 1 +
 tests/kernels/quantization/test_cutlass_2of4_sparse.py      | 1 +
 tests/kernels/quantization/test_cutlass_scaled_mm.py        | 1 +
 tests/kernels/quantization/test_fp8_quant.py                | 1 +
 tests/kernels/quantization/test_ggml.py                     | 1 +
 tests/kernels/quantization/test_gguf.py                     | 1 +
 tests/kernels/quantization/test_gptq.py                     | 1 +
 tests/kernels/quantization/test_int8_kernel.py              | 1 +
 tests/kernels/quantization/test_int8_quant.py               | 1 +
 tests/kernels/quantization/test_machete_mm.py               | 1 +
 tests/kernels/quantization/test_marlin_gemm.py              | 1 +
 tests/kernels/quantization/test_nvfp4_quant.py              | 1 +
 tests/kernels/quantization/test_nvfp4_scaled_mm.py          | 1 +
 tests/kernels/quantization/test_rocm_skinny_gemms.py        | 1 +
 tests/kernels/quantization/test_triton_scaled_mm.py         | 1 +
 tests/kernels/test_cutlass_mla_decode.py                    | 1 +
 tests/kernels/test_fused_quant_activation.py                | 1 +
 tests/kernels/test_triton_flash_attention.py                | 1 +
 tests/kernels/utils.py                                      | 1 +
 tests/kv_transfer/test_disagg.py                            | 1 +
 tests/kv_transfer/test_lookup_buffer.py                     | 1 +
 tests/kv_transfer/test_module.py                            | 1 +
 tests/kv_transfer/test_send_recv.py                         | 1 +
 tests/lora/conftest.py                                      | 1 +
 tests/lora/test_add_lora.py                                 | 1 +
 tests/lora/test_baichuan.py                                 | 1 +
 tests/lora/test_chatglm3_tp.py                              | 1 +
 tests/lora/test_layers.py                                   | 1 +
 tests/lora/test_llama_tp.py                                 | 1 +
 tests/lora/test_lora_allowed_token_ids.py                   | 1 +
 tests/lora/test_lora_checkpoints.py                         | 1 +
 tests/lora/test_lora_functions.py                           | 1 +
 tests/lora/test_lora_huggingface.py                         | 1 +
 tests/lora/test_lora_manager.py                             | 1 +
 tests/lora/test_minicpmv_tp.py                              | 1 +
 tests/lora/test_mixtral.py                                  | 1 +
 tests/lora/test_peft_helper.py                              | 1 +
 tests/lora/test_phi.py                                      | 1 +
 tests/lora/test_punica_ops.py                               | 1 +
 tests/lora/test_quant_model.py                              | 1 +
 tests/lora/test_qwen2vl.py                                  | 1 +
 tests/lora/test_resolver.py                                 | 1 +
 tests/lora/test_tokenizer_group.py                          | 1 +
 tests/lora/test_transfomers_model.py                        | 1 +
 tests/lora/test_utils.py                                    | 1 +
 tests/lora/test_worker.py                                   | 1 +
 tests/lora/utils.py                                         | 1 +
 tests/metrics/test_metrics.py                               | 1 +
 tests/mistral_tool_use/conftest.py                          | 1 +
 tests/mistral_tool_use/test_mistral_tool_calls.py           | 1 +
 tests/mistral_tool_use/utils.py                             | 1 +
 tests/model_executor/conftest.py                            | 1 +
 tests/model_executor/test_enabled_custom_ops.py             | 1 +
 tests/model_executor/test_guided_processors.py              | 1 +
 tests/model_executor/test_logits_processor.py               | 1 +
 tests/model_executor/test_model_load_with_params.py         | 1 +
 tests/model_executor/test_weight_utils.py                   | 1 +
 tests/models/language/generation/test_bart.py               | 1 +
 tests/models/language/generation/test_common.py             | 1 +
 tests/models/language/generation/test_granite.py            | 1 +
 tests/models/language/generation/test_granitemoehybrid.py   | 1 +
 tests/models/language/generation/test_hybrid.py             | 1 +
 tests/models/language/generation/test_mistral.py            | 1 +
 tests/models/language/generation/test_phimoe.py             | 1 +
 tests/models/language/pooling/embed_utils.py                | 1 +
 tests/models/language/pooling/mteb_utils.py                 | 1 +
 tests/models/language/pooling/test_baai.py                  | 1 +
 tests/models/language/pooling/test_classification.py        | 1 +
 tests/models/language/pooling/test_embedding.py             | 1 +
 tests/models/language/pooling/test_gritlm.py                | 1 +
 tests/models/language/pooling/test_gte.py                   | 1 +
 tests/models/language/pooling/test_jina.py                  | 1 +
 tests/models/language/pooling/test_nomic.py                 | 1 +
 tests/models/language/pooling/test_nomic_max_model_len.py   | 1 +
 tests/models/language/pooling/test_scoring.py               | 1 +
 .../models/language/pooling/test_snowflake_arctic_embed.py  | 1 +
 tests/models/language/pooling/test_truncation_control.py    | 1 +
 tests/models/multimodal/generation/test_common.py           | 1 +
 tests/models/multimodal/generation/test_florence2.py        | 1 +
 tests/models/multimodal/generation/test_granite_speech.py   | 1 +
 tests/models/multimodal/generation/test_interleaved.py      | 1 +
 tests/models/multimodal/generation/test_mllama.py           | 1 +
 tests/models/multimodal/generation/test_phi4mm.py           | 1 +
 tests/models/multimodal/generation/test_pixtral.py          | 1 +
 tests/models/multimodal/generation/test_qwen2_vl.py         | 1 +
 tests/models/multimodal/generation/test_ultravox.py         | 1 +
 tests/models/multimodal/generation/test_whisper.py          | 1 +
 tests/models/multimodal/generation/vlm_utils/builders.py    | 1 +
 .../multimodal/generation/vlm_utils/case_filtering.py       | 1 +
 tests/models/multimodal/generation/vlm_utils/core.py        | 1 +
 .../models/multimodal/generation/vlm_utils/custom_inputs.py | 1 +
 tests/models/multimodal/generation/vlm_utils/model_utils.py | 1 +
 tests/models/multimodal/generation/vlm_utils/runners.py     | 1 +
 tests/models/multimodal/generation/vlm_utils/types.py       | 1 +
 tests/models/multimodal/pooling/test_dse_qwen2_vl.py        | 1 +
 tests/models/multimodal/pooling/test_intern_vit.py          | 1 +
 tests/models/multimodal/pooling/test_llava_next.py          | 1 +
 tests/models/multimodal/pooling/test_phi3v.py               | 1 +
 tests/models/multimodal/processing/test_common.py           | 1 +
 tests/models/multimodal/processing/test_h2ovl.py            | 1 +
 tests/models/multimodal/processing/test_idefics3.py         | 1 +
 tests/models/multimodal/processing/test_internvl.py         | 1 +
 tests/models/multimodal/processing/test_llama4.py           | 1 +
 tests/models/multimodal/processing/test_llava_next.py       | 1 +
 tests/models/multimodal/processing/test_llava_onevision.py  | 1 +
 tests/models/multimodal/processing/test_minimax_vl_01.py    | 1 +
 tests/models/multimodal/processing/test_mllama.py           | 1 +
 tests/models/multimodal/processing/test_phi3v.py            | 1 +
 tests/models/multimodal/processing/test_phi4mm.py           | 1 +
 tests/models/multimodal/processing/test_qwen2_vl.py         | 1 +
 tests/models/multimodal/processing/test_smolvlm.py          | 1 +
 tests/models/quantization/test_aqlm.py                      | 1 +
 tests/models/quantization/test_awq.py                       | 1 +
 tests/models/quantization/test_bitblas.py                   | 1 +
 tests/models/quantization/test_fp8.py                       | 1 +
 tests/models/quantization/test_gguf.py                      | 1 +
 tests/models/quantization/test_gptq_bitblas.py              | 1 +
 tests/models/quantization/test_gptq_marlin.py               | 1 +
 tests/models/quantization/test_gptq_marlin_24.py            | 1 +
 tests/models/quantization/test_modelopt.py                  | 1 +
 tests/models/quantization/test_mxfp4.py                     | 1 +
 tests/models/quantization/test_nvfp4.py                     | 1 +
 tests/models/registry.py                                    | 1 +
 tests/models/test_initialization.py                         | 1 +
 tests/models/test_oot_registration.py                       | 1 +
 tests/models/test_registry.py                               | 1 +
 tests/models/test_transformers.py                           | 1 +
 tests/models/test_utils.py                                  | 1 +
 tests/models/test_vision.py                                 | 1 +
 tests/models/utils.py                                       | 1 +
 tests/mq_llm_engine/conftest.py                             | 1 +
 tests/mq_llm_engine/test_abort.py                           | 1 +
 tests/mq_llm_engine/test_error_handling.py                  | 1 +
 tests/mq_llm_engine/test_load.py                            | 1 +
 tests/mq_llm_engine/utils.py                                | 1 +
 tests/multi_step/test_correctness_async_llm.py              | 1 +
 tests/multi_step/test_correctness_llm.py                    | 1 +
 tests/multimodal/test_hasher.py                             | 1 +
 tests/multimodal/test_image.py                              | 1 +
 tests/multimodal/test_inputs.py                             | 1 +
 tests/multimodal/test_processing.py                         | 1 +
 tests/multimodal/test_utils.py                              | 1 +
 tests/multimodal/test_video.py                              | 1 +
 tests/multimodal/utils.py                                   | 1 +
 tests/neuron/1_core/test_activation.py                      | 1 +
 tests/neuron/1_core/test_block_table.py                     | 1 +
 tests/neuron/1_core/test_cache.py                           | 1 +
 tests/neuron/1_core/test_layernorm.py                       | 1 +
 tests/neuron/1_core/test_logits_processor.py                | 1 +
 tests/neuron/1_core/test_neuron_model_runner.py             | 1 +
 tests/neuron/1_core/test_neuron_quant.py                    | 1 +
 tests/neuron/1_core/test_prefix_prefill.py                  | 1 +
 tests/neuron/1_core/test_rotary_embedding.py                | 1 +
 tests/neuron/2_core/test_comm_ops.py                        | 1 +
 tests/neuron/2_core/test_eagle.py                           | 1 +
 tests/neuron/2_core/test_mistral.py                         | 1 +
 tests/neuron/2_core/test_multi_lora.py                      | 1 +
 tests/plugins/lora_resolvers/test_filesystem_resolver.py    | 1 +
 tests/plugins/vllm_add_dummy_model/setup.py                 | 1 +
 .../vllm_add_dummy_model/vllm_add_dummy_model/__init__.py   | 1 +
 .../vllm_add_dummy_model/my_gemma_embedding.py              | 1 +
 .../vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py   | 1 +
 .../vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py     | 1 +
 tests/plugins/vllm_add_dummy_platform/setup.py              | 1 +
 .../vllm_add_dummy_platform/__init__.py                     | 1 +
 .../vllm_add_dummy_platform/dummy_attention_backend.py      | 1 +
 .../vllm_add_dummy_platform/dummy_platform.py               | 1 +
 tests/plugins_tests/conftest.py                             | 1 +
 tests/plugins_tests/test_platform_plugins.py                | 1 +
 tests/plugins_tests/test_scheduler_plugins.py               | 1 +
 tests/prefix_caching/test_disable_sliding_window.py         | 1 +
 tests/prefix_caching/test_prefix_caching.py                 | 1 +
 tests/prompt_adapter/test_bloom.py                          | 1 +
 tests/prompt_adapter/test_multi_adapter_inference.py        | 1 +
 tests/prompt_adapter/test_pa_lora.py                        | 1 +
 tests/quantization/test_auto_round.py                       | 1 +
 tests/quantization/test_bitsandbytes.py                     | 1 +
 tests/quantization/test_compressed_tensors.py               | 1 +
 tests/quantization/test_configs.py                          | 1 +
 tests/quantization/test_cpu_offload.py                      | 3 ++-
 tests/quantization/test_experts_int8.py                     | 1 +
 tests/quantization/test_fp8.py                              | 1 +
 tests/quantization/test_gptq_dynamic.py                     | 1 +
 tests/quantization/test_ipex_quant.py                       | 1 +
 tests/quantization/test_lm_head.py                          | 1 +
 tests/quantization/test_ptpc_fp8.py                         | 1 +
 tests/quantization/test_quark.py                            | 1 +
 tests/quantization/test_register_quantization_config.py     | 1 +
 tests/quantization/test_torchao.py                          | 1 +
 tests/quantization/utils.py                                 | 1 +
 tests/reasoning/test_deepseekr1_reasoning_parser.py         | 1 +
 tests/reasoning/test_granite_reasoning_parser.py            | 1 +
 tests/reasoning/test_qwen3_reasoning_parser.py              | 1 +
 tests/reasoning/utils.py                                    | 1 +
 .../test_runai_model_streamer_loader.py                     | 1 +
 tests/runai_model_streamer_test/test_weight_utils.py        | 1 +
 tests/samplers/test_beam_search.py                          | 1 +
 tests/samplers/test_ignore_eos.py                           | 1 +
 tests/samplers/test_logits_processor.py                     | 1 +
 tests/samplers/test_logprobs.py                             | 1 +
 tests/samplers/test_no_bad_words.py                         | 1 +
 tests/samplers/test_ranks.py                                | 1 +
 tests/samplers/test_rejection_sampler.py                    | 1 +
 tests/samplers/test_sampler.py                              | 1 +
 tests/samplers/test_seeded_generate.py                      | 1 +
 tests/samplers/test_typical_acceptance_sampler.py           | 1 +
 tests/spec_decode/conftest.py                               | 1 +
 tests/spec_decode/e2e/conftest.py                           | 1 +
 tests/spec_decode/e2e/test_compatibility.py                 | 1 +
 tests/spec_decode/e2e/test_eagle_correctness.py             | 1 +
 tests/spec_decode/e2e/test_integration.py                   | 1 +
 tests/spec_decode/e2e/test_integration_dist_tp2.py          | 1 +
 tests/spec_decode/e2e/test_integration_dist_tp4.py          | 1 +
 tests/spec_decode/e2e/test_logprobs.py                      | 1 +
 tests/spec_decode/e2e/test_medusa_correctness.py            | 1 +
 tests/spec_decode/e2e/test_mlp_correctness.py               | 1 +
 tests/spec_decode/e2e/test_mtp_correctness.py               | 1 +
 tests/spec_decode/e2e/test_multistep_correctness.py         | 1 +
 tests/spec_decode/e2e/test_ngram_correctness.py             | 1 +
 tests/spec_decode/e2e/test_seed.py                          | 1 +
 tests/spec_decode/test_batch_expansion.py                   | 1 +
 tests/spec_decode/test_dynamic_spec_decode.py               | 1 +
 tests/spec_decode/test_memory_usage.py                      | 1 +
 tests/spec_decode/test_metrics.py                           | 1 +
 tests/spec_decode/test_multi_step_worker.py                 | 1 +
 tests/spec_decode/test_ngram_worker.py                      | 1 +
 tests/spec_decode/test_scorer.py                            | 1 +
 tests/spec_decode/test_spec_decode_worker.py                | 1 +
 tests/spec_decode/test_utils.py                             | 1 +
 tests/spec_decode/utils.py                                  | 1 +
 tests/standalone_tests/lazy_imports.py                      | 1 +
 tests/tensorizer_loader/conftest.py                         | 1 +
 tests/tensorizer_loader/test_tensorizer.py                  | 1 +
 tests/test_cache_block_hashing.py                           | 1 +
 tests/test_config.py                                        | 1 +
 tests/test_embedded_commit.py                               | 1 +
 tests/test_inputs.py                                        | 1 +
 tests/test_logger.py                                        | 1 +
 tests/test_outputs.py                                       | 1 +
 tests/test_regression.py                                    | 1 +
 tests/test_sampling_params.py                               | 1 +
 tests/test_scalartype.py                                    | 1 +
 tests/test_seed_behavior.py                                 | 3 ++-
 tests/test_sequence.py                                      | 1 +
 tests/test_sharded_state_loader.py                          | 1 +
 tests/test_triton_utils.py                                  | 1 +
 tests/test_utils.py                                         | 1 +
 tests/test_version.py                                       | 1 +
 tests/test_vllm_port.py                                     | 1 +
 tests/tokenization/test_cached_tokenizer.py                 | 1 +
 tests/tokenization/test_detokenize.py                       | 1 +
 tests/tokenization/test_get_eos.py                          | 1 +
 tests/tokenization/test_mistral_tokenizer.py                | 1 +
 tests/tokenization/test_tokenizer.py                        | 1 +
 tests/tokenization/test_tokenizer_group.py                  | 1 +
 tests/tokenization/test_tokenizer_registry.py               | 1 +
 tests/tool_use/conftest.py                                  | 1 +
 tests/tool_use/test_chat_completion_request_validations.py  | 1 +
 tests/tool_use/test_chat_completions.py                     | 1 +
 tests/tool_use/test_jamba_tool_parser.py                    | 1 +
 tests/tool_use/test_parallel_tool_calls.py                  | 1 +
 tests/tool_use/test_tool_calls.py                           | 1 +
 tests/tool_use/test_tool_choice_required.py                 | 1 +
 tests/tool_use/utils.py                                     | 1 +
 tests/tpu/lora/test_lora.py                                 | 1 +
 tests/tpu/test_compilation.py                               | 1 +
 tests/tpu/test_custom_dispatcher.py                         | 1 +
 tests/tpu/test_moe_pallas.py                                | 1 +
 tests/tpu/test_quantization_accuracy.py                     | 1 +
 tests/tracing/test_tracing.py                               | 1 +
 tests/utils.py                                              | 1 +
 tests/v1/core/test_kv_cache_utils.py                        | 1 +
 tests/v1/core/test_prefix_caching.py                        | 1 +
 tests/v1/core/test_scheduler.py                             | 1 +
 tests/v1/core/test_scheduler_e2e.py                         | 1 +
 tests/v1/core/test_specialized_manager.py                   | 1 +
 tests/v1/e2e/test_cascade_attention.py                      | 1 +
 tests/v1/e2e/test_correctness_sliding_window.py             | 1 +
 tests/v1/e2e/test_spec_decode.py                            | 1 +
 tests/v1/engine/conftest.py                                 | 1 +
 tests/v1/engine/test_async_llm.py                           | 1 +
 tests/v1/engine/test_engine_args.py                         | 1 +
 tests/v1/engine/test_engine_core.py                         | 1 +
 tests/v1/engine/test_engine_core_client.py                  | 1 +
 tests/v1/engine/test_llm_engine.py                          | 1 +
 tests/v1/engine/test_output_processor.py                    | 1 +
 tests/v1/engine/utils.py                                    | 1 +
 tests/v1/entrypoints/conftest.py                            | 1 +
 tests/v1/entrypoints/llm/test_struct_output_generate.py     | 1 +
 tests/v1/entrypoints/openai/test_chat_completion.py         | 1 +
 tests/v1/entrypoints/openai/test_completion.py              | 1 +
 tests/v1/entrypoints/openai/test_multi_api_servers.py       | 1 +
 tests/v1/kv_connector/nixl_integration/test_accuracy.py     | 1 +
 tests/v1/kv_connector/nixl_integration/test_edge_cases.py   | 1 +
 tests/v1/kv_connector/nixl_integration/toy_proxy_server.py  | 1 +
 tests/v1/kv_connector/unit/test_multi_connector.py          | 1 +
 tests/v1/kv_connector/unit/test_nixl_connector.py           | 1 +
 tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py  | 1 +
 tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py | 1 +
 tests/v1/kv_connector/unit/utils.py                         | 1 +
 tests/v1/metrics/test_ray_metrics.py                        | 1 +
 tests/v1/sample/test_logprobs.py                            | 1 +
 tests/v1/sample/test_logprobs_e2e.py                        | 1 +
 tests/v1/sample/test_rejection_sampler.py                   | 1 +
 tests/v1/sample/test_sampler.py                             | 1 +
 tests/v1/sample/test_sampling_params_e2e.py                 | 1 +
 tests/v1/sample/test_topk_topp_sampler.py                   | 1 +
 tests/v1/sample/utils.py                                    | 1 +
 tests/v1/shutdown/test_delete.py                            | 1 +
 tests/v1/shutdown/test_forward_error.py                     | 1 +
 tests/v1/shutdown/test_processor_error.py                   | 1 +
 tests/v1/shutdown/test_startup_error.py                     | 1 +
 tests/v1/shutdown/utils.py                                  | 1 +
 tests/v1/spec_decode/test_eagle.py                          | 1 +
 tests/v1/spec_decode/test_max_len.py                        | 1 +
 tests/v1/spec_decode/test_ngram.py                          | 1 +
 tests/v1/structured_output/test_utils.py                    | 1 +
 tests/v1/test_async_llm_dp.py                               | 1 +
 tests/v1/test_metrics_reader.py                             | 1 +
 tests/v1/test_oracle.py                                     | 1 +
 tests/v1/test_serial_utils.py                               | 1 +
 tests/v1/test_utils.py                                      | 1 +
 tests/v1/tpu/test_basic.py                                  | 1 +
 tests/v1/tpu/test_mha_attn.py                               | 1 +
 tests/v1/tpu/test_multimodal.py                             | 1 +
 tests/v1/tpu/test_pallas.py                                 | 1 +
 tests/v1/tpu/test_perf.py                                   | 1 +
 tests/v1/tpu/test_sampler.py                                | 1 +
 tests/v1/tpu/test_topk_topp_sampler.py                      | 1 +
 tests/v1/tpu/worker/test_tpu_model_runner.py                | 1 +
 tests/v1/worker/test_gpu_input_batch.py                     | 1 +
 tests/v1/worker/test_gpu_model_runner.py                    | 1 +
 tests/vllm_test_utils/setup.py                              | 1 +
 tests/vllm_test_utils/vllm_test_utils/__init__.py           | 1 +
 tests/vllm_test_utils/vllm_test_utils/blame.py              | 1 +
 tests/vllm_test_utils/vllm_test_utils/monitor.py            | 1 +
 tests/weight_loading/test_weight_loading.py                 | 1 +
 tests/worker/conftest.py                                    | 1 +
 tests/worker/test_encoder_decoder_model_runner.py           | 1 +
 tests/worker/test_model_input.py                            | 1 +
 tests/worker/test_model_runner.py                           | 1 +
 tests/worker/test_profile.py                                | 1 +
 tests/worker/test_swap.py                                   | 1 +
 tools/check_spdx_header.py                                  | 5 ++++-
 tools/check_triton_import.py                                | 1 +
 tools/enforce_regex_import.py                               | 1 +
 tools/profiler/print_layerwise_table.py                     | 1 +
 tools/profiler/visualize_layerwise_profile.py               | 1 +
 tools/report_build_time_ninja.py                            | 1 +
 use_existing_torch.py                                       | 1 +
 vllm/__init__.py                                            | 1 +
 vllm/_custom_ops.py                                         | 1 +
 vllm/_ipex_ops.py                                           | 1 +
 vllm/adapter_commons/layers.py                              | 1 +
 vllm/adapter_commons/models.py                              | 1 +
 vllm/adapter_commons/request.py                             | 1 +
 vllm/adapter_commons/utils.py                               | 1 +
 vllm/adapter_commons/worker_manager.py                      | 1 +
 vllm/assets/audio.py                                        | 1 +
 vllm/assets/base.py                                         | 1 +
 vllm/assets/image.py                                        | 1 +
 vllm/assets/video.py                                        | 1 +
 vllm/attention/__init__.py                                  | 1 +
 vllm/attention/backends/abstract.py                         | 1 +
 vllm/attention/backends/blocksparse_attn.py                 | 1 +
 vllm/attention/backends/cpu_mla.py                          | 1 +
 vllm/attention/backends/dual_chunk_flash_attn.py            | 1 +
 vllm/attention/backends/flash_attn.py                       | 1 +
 vllm/attention/backends/flashinfer.py                       | 1 +
 vllm/attention/backends/flashmla.py                         | 1 +
 vllm/attention/backends/hpu_attn.py                         | 1 +
 vllm/attention/backends/ipex_attn.py                        | 1 +
 vllm/attention/backends/mla/common.py                       | 1 +
 vllm/attention/backends/pallas.py                           | 1 +
 vllm/attention/backends/placeholder_attn.py                 | 1 +
 vllm/attention/backends/rocm_aiter_mla.py                   | 1 +
 vllm/attention/backends/rocm_flash_attn.py                  | 1 +
 vllm/attention/backends/torch_sdpa.py                       | 1 +
 vllm/attention/backends/triton_mla.py                       | 1 +
 vllm/attention/backends/utils.py                            | 1 +
 vllm/attention/backends/xformers.py                         | 1 +
 vllm/attention/layer.py                                     | 1 +
 .../blocksparse_attention/blocksparse_attention_kernel.py   | 1 +
 vllm/attention/ops/blocksparse_attention/interface.py       | 1 +
 vllm/attention/ops/blocksparse_attention/utils.py           | 1 +
 vllm/attention/ops/chunked_prefill_paged_decode.py          | 1 +
 vllm/attention/ops/flashmla.py                              | 1 +
 vllm/attention/ops/hpu_paged_attn.py                        | 1 +
 vllm/attention/ops/ipex_attn.py                             | 1 +
 vllm/attention/ops/merge_attn_states.py                     | 1 +
 vllm/attention/ops/nki_flash_attn.py                        | 1 +
 vllm/attention/ops/paged_attn.py                            | 1 +
 vllm/attention/ops/prefix_prefill.py                        | 1 +
 vllm/attention/ops/rocm_aiter_mla.py                        | 1 +
 vllm/attention/ops/rocm_aiter_paged_attn.py                 | 1 +
 vllm/attention/ops/triton_decode_attention.py               | 1 +
 vllm/attention/ops/triton_flash_attention.py                | 1 +
 vllm/attention/ops/triton_merge_attn_states.py              | 1 +
 vllm/attention/ops/triton_unified_attention.py              | 1 +
 vllm/attention/selector.py                                  | 1 +
 vllm/attention/utils/fa_utils.py                            | 1 +
 vllm/beam_search.py                                         | 1 +
 vllm/benchmarks/datasets.py                                 | 1 +
 vllm/benchmarks/endpoint_request_func.py                    | 1 +
 vllm/benchmarks/latency.py                                  | 1 +
 vllm/benchmarks/serve.py                                    | 1 +
 vllm/benchmarks/throughput.py                               | 1 +
 vllm/benchmarks/utils.py                                    | 1 +
 vllm/collect_env.py                                         | 6 ++++--
 vllm/compilation/activation_quant_fusion.py                 | 1 +
 vllm/compilation/backends.py                                | 1 +
 vllm/compilation/base_piecewise_backend.py                  | 1 +
 vllm/compilation/collective_fusion.py                       | 1 +
 vllm/compilation/compiler_interface.py                      | 1 +
 vllm/compilation/counter.py                                 | 1 +
 vllm/compilation/cuda_piecewise_backend.py                  | 1 +
 vllm/compilation/decorators.py                              | 1 +
 vllm/compilation/fix_functionalization.py                   | 1 +
 vllm/compilation/fusion.py                                  | 1 +
 vllm/compilation/fx_utils.py                                | 1 +
 vllm/compilation/inductor_pass.py                           | 1 +
 vllm/compilation/monitor.py                                 | 1 +
 vllm/compilation/multi_output_match.py                      | 1 +
 vllm/compilation/noop_elimination.py                        | 1 +
 vllm/compilation/pass_manager.py                            | 1 +
 vllm/compilation/sequence_parallelism.py                    | 1 +
 vllm/compilation/torch25_custom_graph_pass.py               | 1 +
 vllm/compilation/vllm_inductor_pass.py                      | 1 +
 vllm/compilation/wrapper.py                                 | 1 +
 vllm/config.py                                              | 1 +
 vllm/connections.py                                         | 1 +
 vllm/core/block/block_table.py                              | 1 +
 vllm/core/block/common.py                                   | 1 +
 vllm/core/block/cpu_gpu_block_allocator.py                  | 1 +
 vllm/core/block/interfaces.py                               | 1 +
 vllm/core/block/naive_block.py                              | 1 +
 vllm/core/block/prefix_caching_block.py                     | 1 +
 vllm/core/block/utils.py                                    | 1 +
 vllm/core/block_manager.py                                  | 1 +
 vllm/core/evictor.py                                        | 1 +
 vllm/core/interfaces.py                                     | 1 +
 vllm/core/placeholder_block_space_manager.py                | 1 +
 vllm/core/scheduler.py                                      | 1 +
 vllm/device_allocator/cumem.py                              | 1 +
 vllm/distributed/__init__.py                                | 1 +
 vllm/distributed/communication_op.py                        | 1 +
 vllm/distributed/device_communicators/all2all.py            | 1 +
 .../device_communicators/base_device_communicator.py        | 1 +
 vllm/distributed/device_communicators/cpu_communicator.py   | 1 +
 vllm/distributed/device_communicators/cuda_communicator.py  | 1 +
 vllm/distributed/device_communicators/cuda_wrapper.py       | 1 +
 vllm/distributed/device_communicators/custom_all_reduce.py  | 1 +
 .../device_communicators/custom_all_reduce_utils.py         | 1 +
 vllm/distributed/device_communicators/hpu_communicator.py   | 1 +
 .../distributed/device_communicators/neuron_communicator.py | 1 +
 vllm/distributed/device_communicators/pynccl.py             | 1 +
 vllm/distributed/device_communicators/pynccl_wrapper.py     | 1 +
 vllm/distributed/device_communicators/shm_broadcast.py      | 1 +
 vllm/distributed/device_communicators/tpu_communicator.py   | 1 +
 vllm/distributed/device_communicators/xpu_communicator.py   | 1 +
 vllm/distributed/kv_events.py                               | 1 +
 vllm/distributed/kv_transfer/__init__.py                    | 1 +
 vllm/distributed/kv_transfer/kv_connector/base.py           | 1 +
 vllm/distributed/kv_transfer/kv_connector/factory.py        | 1 +
 .../kv_transfer/kv_connector/lmcache_connector.py           | 1 +
 .../kv_transfer/kv_connector/mooncake_store_connector.py    | 1 +
 .../kv_transfer/kv_connector/simple_connector.py            | 1 +
 vllm/distributed/kv_transfer/kv_connector/utils.py          | 1 +
 vllm/distributed/kv_transfer/kv_connector/v1/__init__.py    | 1 +
 vllm/distributed/kv_transfer/kv_connector/v1/base.py        | 1 +
 .../kv_transfer/kv_connector/v1/lmcache_connector.py        | 1 +
 .../kv_transfer/kv_connector/v1/multi_connector.py          | 1 +
 .../kv_transfer/kv_connector/v1/nixl_connector.py           | 1 +
 .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 1 +
 vllm/distributed/kv_transfer/kv_connector_agent.py          | 1 +
 vllm/distributed/kv_transfer/kv_lookup_buffer/base.py       | 1 +
 .../kv_transfer/kv_lookup_buffer/mooncake_store.py          | 1 +
 .../kv_transfer/kv_lookup_buffer/simple_buffer.py           | 1 +
 vllm/distributed/kv_transfer/kv_pipe/base.py                | 1 +
 vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py       | 1 +
 vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py         | 1 +
 vllm/distributed/kv_transfer/kv_transfer_state.py           | 1 +
 vllm/distributed/parallel_state.py                          | 1 +
 vllm/distributed/utils.py                                   | 1 +
 vllm/engine/arg_utils.py                                    | 1 +
 vllm/engine/async_llm_engine.py                             | 1 +
 vllm/engine/async_timeout.py                                | 1 +
 vllm/engine/llm_engine.py                                   | 1 +
 vllm/engine/metrics.py                                      | 1 +
 vllm/engine/metrics_types.py                                | 1 +
 vllm/engine/multiprocessing/__init__.py                     | 1 +
 vllm/engine/multiprocessing/client.py                       | 1 +
 vllm/engine/multiprocessing/engine.py                       | 1 +
 vllm/engine/output_processor/interfaces.py                  | 1 +
 vllm/engine/output_processor/multi_step.py                  | 1 +
 vllm/engine/output_processor/single_step.py                 | 1 +
 vllm/engine/output_processor/stop_checker.py                | 1 +
 vllm/engine/output_processor/util.py                        | 1 +
 vllm/engine/protocol.py                                     | 1 +
 vllm/entrypoints/api_server.py                              | 1 +
 vllm/entrypoints/chat_utils.py                              | 1 +
 vllm/entrypoints/cli/benchmark/base.py                      | 1 +
 vllm/entrypoints/cli/benchmark/latency.py                   | 1 +
 vllm/entrypoints/cli/benchmark/main.py                      | 1 +
 vllm/entrypoints/cli/benchmark/serve.py                     | 1 +
 vllm/entrypoints/cli/benchmark/throughput.py                | 1 +
 vllm/entrypoints/cli/collect_env.py                         | 1 +
 vllm/entrypoints/cli/main.py                                | 1 +
 vllm/entrypoints/cli/openai.py                              | 1 +
 vllm/entrypoints/cli/run_batch.py                           | 1 +
 vllm/entrypoints/cli/serve.py                               | 1 +
 vllm/entrypoints/cli/types.py                               | 1 +
 vllm/entrypoints/launcher.py                                | 1 +
 vllm/entrypoints/llm.py                                     | 1 +
 vllm/entrypoints/logger.py                                  | 1 +
 vllm/entrypoints/openai/api_server.py                       | 1 +
 vllm/entrypoints/openai/cli_args.py                         | 1 +
 vllm/entrypoints/openai/logits_processors.py                | 1 +
 vllm/entrypoints/openai/protocol.py                         | 1 +
 vllm/entrypoints/openai/run_batch.py                        | 1 +
 vllm/entrypoints/openai/serving_chat.py                     | 1 +
 vllm/entrypoints/openai/serving_classification.py           | 1 +
 vllm/entrypoints/openai/serving_completion.py               | 1 +
 vllm/entrypoints/openai/serving_embedding.py                | 1 +
 vllm/entrypoints/openai/serving_engine.py                   | 1 +
 vllm/entrypoints/openai/serving_models.py                   | 1 +
 vllm/entrypoints/openai/serving_pooling.py                  | 1 +
 vllm/entrypoints/openai/serving_score.py                    | 1 +
 vllm/entrypoints/openai/serving_tokenization.py             | 1 +
 vllm/entrypoints/openai/serving_transcription.py            | 1 +
 vllm/entrypoints/openai/tool_parsers/__init__.py            | 1 +
 .../entrypoints/openai/tool_parsers/abstract_tool_parser.py | 1 +
 .../openai/tool_parsers/deepseekv3_tool_parser.py           | 1 +
 .../openai/tool_parsers/granite_20b_fc_tool_parser.py       | 1 +
 vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py | 1 +
 vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py  | 1 +
 .../openai/tool_parsers/internlm2_tool_parser.py            | 1 +
 vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py   | 1 +
 .../openai/tool_parsers/llama4_pythonic_tool_parser.py      | 1 +
 vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py   | 1 +
 vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py | 1 +
 .../entrypoints/openai/tool_parsers/phi4mini_tool_parser.py | 1 +
 .../entrypoints/openai/tool_parsers/pythonic_tool_parser.py | 1 +
 vllm/entrypoints/openai/tool_parsers/utils.py               | 1 +
 vllm/entrypoints/score_utils.py                             | 1 +
 vllm/entrypoints/ssl.py                                     | 1 +
 vllm/entrypoints/utils.py                                   | 1 +
 vllm/env_override.py                                        | 1 +
 vllm/envs.py                                                | 1 +
 vllm/executor/executor_base.py                              | 1 +
 vllm/executor/mp_distributed_executor.py                    | 1 +
 vllm/executor/msgspec_utils.py                              | 1 +
 vllm/executor/multiproc_worker_utils.py                     | 1 +
 vllm/executor/ray_distributed_executor.py                   | 1 +
 vllm/executor/ray_utils.py                                  | 1 +
 vllm/executor/uniproc_executor.py                           | 1 +
 vllm/forward_context.py                                     | 1 +
 vllm/inputs/__init__.py                                     | 1 +
 vllm/inputs/data.py                                         | 1 +
 vllm/inputs/parse.py                                        | 1 +
 vllm/inputs/preprocess.py                                   | 1 +
 vllm/inputs/registry.py                                     | 1 +
 vllm/jsontree.py                                            | 1 +
 vllm/logger.py                                              | 1 +
 vllm/logging_utils/__init__.py                              | 1 +
 vllm/logging_utils/dump_input.py                            | 1 +
 vllm/logging_utils/formatter.py                             | 1 +
 vllm/logits_process.py                                      | 1 +
 vllm/lora/fully_sharded_layers.py                           | 1 +
 vllm/lora/layers.py                                         | 1 +
 vllm/lora/lora.py                                           | 1 +
 vllm/lora/models.py                                         | 1 +
 vllm/lora/ops/torch_ops/__init__.py                         | 1 +
 vllm/lora/ops/torch_ops/lora_ops.py                         | 1 +
 vllm/lora/ops/triton_ops/__init__.py                        | 1 +
 vllm/lora/ops/triton_ops/kernel_utils.py                    | 1 +
 vllm/lora/ops/triton_ops/lora_expand_op.py                  | 1 +
 vllm/lora/ops/triton_ops/lora_kernel_metadata.py            | 1 +
 vllm/lora/ops/triton_ops/lora_shrink_op.py                  | 1 +
 vllm/lora/ops/triton_ops/utils.py                           | 1 +
 vllm/lora/ops/xla_ops/__init__.py                           | 1 +
 vllm/lora/ops/xla_ops/lora_ops.py                           | 1 +
 vllm/lora/peft_helper.py                                    | 1 +
 vllm/lora/punica_wrapper/__init__.py                        | 1 +
 vllm/lora/punica_wrapper/punica_base.py                     | 1 +
 vllm/lora/punica_wrapper/punica_cpu.py                      | 1 +
 vllm/lora/punica_wrapper/punica_gpu.py                      | 1 +
 vllm/lora/punica_wrapper/punica_hpu.py                      | 1 +
 vllm/lora/punica_wrapper/punica_selector.py                 | 1 +
 vllm/lora/punica_wrapper/punica_tpu.py                      | 1 +
 vllm/lora/punica_wrapper/utils.py                           | 1 +
 vllm/lora/request.py                                        | 1 +
 vllm/lora/resolver.py                                       | 1 +
 vllm/lora/utils.py                                          | 1 +
 vllm/lora/worker_manager.py                                 | 1 +
 vllm/model_executor/__init__.py                             | 1 +
 vllm/model_executor/custom_op.py                            | 1 +
 vllm/model_executor/guided_decoding/__init__.py             | 1 +
 vllm/model_executor/guided_decoding/guidance_decoding.py    | 1 +
 .../guided_decoding/guidance_logits_processors.py           | 1 +
 vllm/model_executor/guided_decoding/guided_fields.py        | 1 +
 .../guided_decoding/lm_format_enforcer_decoding.py          | 1 +
 vllm/model_executor/guided_decoding/outlines_decoding.py    | 1 +
 .../guided_decoding/outlines_logits_processors.py           | 1 +
 vllm/model_executor/guided_decoding/utils.py                | 1 +
 vllm/model_executor/guided_decoding/xgrammar_decoding.py    | 1 +
 vllm/model_executor/layers/activation.py                    | 1 +
 vllm/model_executor/layers/fused_moe/__init__.py            | 1 +
 vllm/model_executor/layers/fused_moe/cutlass_moe.py         | 1 +
 vllm/model_executor/layers/fused_moe/deep_gemm_moe.py       | 1 +
 vllm/model_executor/layers/fused_moe/fused_batched_moe.py   | 1 +
 vllm/model_executor/layers/fused_moe/fused_marlin_moe.py    | 1 +
 vllm/model_executor/layers/fused_moe/fused_moe.py           | 1 +
 vllm/model_executor/layers/fused_moe/layer.py               | 1 +
 vllm/model_executor/layers/fused_moe/modular_kernel.py      | 1 +
 .../model_executor/layers/fused_moe/moe_align_block_size.py | 1 +
 vllm/model_executor/layers/fused_moe/moe_pallas.py          | 1 +
 .../layers/fused_moe/moe_permute_unpermute.py               | 1 +
 vllm/model_executor/layers/fused_moe/moe_torch_iterative.py | 1 +
 .../layers/fused_moe/pplx_prepare_finalize.py               | 1 +
 vllm/model_executor/layers/fused_moe/prepare_finalize.py    | 1 +
 .../model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 1 +
 .../model_executor/layers/fused_moe/triton_deep_gemm_moe.py | 1 +
 vllm/model_executor/layers/fused_moe/utils.py               | 1 +
 vllm/model_executor/layers/layernorm.py                     | 1 +
 vllm/model_executor/layers/lightning_attn.py                | 1 +
 vllm/model_executor/layers/linear.py                        | 1 +
 vllm/model_executor/layers/logits_processor.py              | 1 +
 vllm/model_executor/layers/mamba/mamba2_metadata.py         | 1 +
 vllm/model_executor/layers/mamba/mamba_mixer.py             | 1 +
 vllm/model_executor/layers/mamba/mamba_mixer2.py            | 1 +
 vllm/model_executor/layers/mamba/ops/causal_conv1d.py       | 1 +
 vllm/model_executor/layers/mamba/ops/mamba_ssm.py           | 1 +
 vllm/model_executor/layers/mamba/ops/ssd_bmm.py             | 1 +
 vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py      | 1 +
 vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py     | 1 +
 vllm/model_executor/layers/mamba/ops/ssd_combined.py        | 1 +
 vllm/model_executor/layers/mamba/ops/ssd_state_passing.py   | 1 +
 vllm/model_executor/layers/pooler.py                        | 1 +
 vllm/model_executor/layers/quantization/__init__.py         | 1 +
 vllm/model_executor/layers/quantization/aqlm.py             | 1 +
 vllm/model_executor/layers/quantization/auto_round.py       | 1 +
 vllm/model_executor/layers/quantization/awq.py              | 1 +
 vllm/model_executor/layers/quantization/awq_marlin.py       | 1 +
 vllm/model_executor/layers/quantization/awq_triton.py       | 1 +
 vllm/model_executor/layers/quantization/base_config.py      | 1 +
 vllm/model_executor/layers/quantization/bitblas.py          | 1 +
 vllm/model_executor/layers/quantization/bitsandbytes.py     | 1 +
 .../quantization/compressed_tensors/compressed_tensors.py   | 1 +
 .../compressed_tensors/compressed_tensors_moe.py            | 1 +
 .../quantization/compressed_tensors/schemes/__init__.py     | 1 +
 .../compressed_tensors/schemes/compressed_tensors_24.py     | 1 +
 .../compressed_tensors/schemes/compressed_tensors_scheme.py | 1 +
 .../schemes/compressed_tensors_w4a16_24.py                  | 1 +
 .../schemes/compressed_tensors_w4a16_nvfp4.py               | 1 +
 .../schemes/compressed_tensors_w8a16_fp8.py                 | 1 +
 .../schemes/compressed_tensors_w8a8_fp8.py                  | 1 +
 .../schemes/compressed_tensors_w8a8_int8.py                 | 1 +
 .../compressed_tensors/schemes/compressed_tensors_wNa16.py  | 1 +
 .../quantization/compressed_tensors/triton_scaled_mm.py     | 1 +
 .../layers/quantization/compressed_tensors/utils.py         | 1 +
 vllm/model_executor/layers/quantization/deepspeedfp.py      | 1 +
 vllm/model_executor/layers/quantization/experts_int8.py     | 1 +
 vllm/model_executor/layers/quantization/fbgemm_fp8.py       | 1 +
 vllm/model_executor/layers/quantization/fp8.py              | 1 +
 vllm/model_executor/layers/quantization/gguf.py             | 1 +
 vllm/model_executor/layers/quantization/gptq.py             | 1 +
 vllm/model_executor/layers/quantization/gptq_bitblas.py     | 1 +
 vllm/model_executor/layers/quantization/gptq_marlin.py      | 1 +
 vllm/model_executor/layers/quantization/gptq_marlin_24.py   | 1 +
 vllm/model_executor/layers/quantization/hqq_marlin.py       | 1 +
 vllm/model_executor/layers/quantization/ipex_quant.py       | 1 +
 .../quantization/kernels/mixed_precision/MPLinearKernel.py  | 1 +
 .../layers/quantization/kernels/mixed_precision/__init__.py | 1 +
 .../layers/quantization/kernels/mixed_precision/allspark.py | 1 +
 .../layers/quantization/kernels/mixed_precision/bitblas.py  | 1 +
 .../layers/quantization/kernels/mixed_precision/exllama.py  | 1 +
 .../layers/quantization/kernels/mixed_precision/machete.py  | 1 +
 .../layers/quantization/kernels/mixed_precision/marlin.py   | 1 +
 .../quantization/kernels/scaled_mm/ScaledMMLinearKernel.py  | 1 +
 .../layers/quantization/kernels/scaled_mm/__init__.py       | 1 +
 .../layers/quantization/kernels/scaled_mm/aiter.py          | 1 +
 .../layers/quantization/kernels/scaled_mm/cutlass.py        | 1 +
 .../layers/quantization/kernels/scaled_mm/triton.py         | 1 +
 .../layers/quantization/kernels/scaled_mm/xla.py            | 1 +
 vllm/model_executor/layers/quantization/kv_cache.py         | 1 +
 vllm/model_executor/layers/quantization/marlin.py           | 1 +
 vllm/model_executor/layers/quantization/modelopt.py         | 1 +
 vllm/model_executor/layers/quantization/moe_wna16.py        | 1 +
 vllm/model_executor/layers/quantization/neuron_quant.py     | 1 +
 vllm/model_executor/layers/quantization/ptpc_fp8.py         | 1 +
 vllm/model_executor/layers/quantization/qqq.py              | 1 +
 vllm/model_executor/layers/quantization/quark/quark.py      | 1 +
 vllm/model_executor/layers/quantization/quark/quark_moe.py  | 1 +
 .../layers/quantization/quark/schemes/__init__.py           | 1 +
 .../layers/quantization/quark/schemes/quark_scheme.py       | 1 +
 .../layers/quantization/quark/schemes/quark_w4a4_mxfp4.py   | 1 +
 .../layers/quantization/quark/schemes/quark_w8a8_fp8.py     | 1 +
 .../layers/quantization/quark/schemes/quark_w8a8_int8.py    | 1 +
 vllm/model_executor/layers/quantization/quark/utils.py      | 1 +
 vllm/model_executor/layers/quantization/schema.py           | 1 +
 vllm/model_executor/layers/quantization/torchao.py          | 1 +
 vllm/model_executor/layers/quantization/tpu_int8.py         | 1 +
 vllm/model_executor/layers/quantization/utils/__init__.py   | 1 +
 .../layers/quantization/utils/allspark_utils.py             | 1 +
 .../layers/quantization/utils/bitblas_utils.py              | 1 +
 vllm/model_executor/layers/quantization/utils/fp8_utils.py  | 1 +
 vllm/model_executor/layers/quantization/utils/gptq_utils.py | 1 +
 vllm/model_executor/layers/quantization/utils/int8_utils.py | 1 +
 .../model_executor/layers/quantization/utils/layer_utils.py | 1 +
 .../layers/quantization/utils/machete_utils.py              | 1 +
 .../layers/quantization/utils/marlin_utils.py               | 1 +
 .../layers/quantization/utils/marlin_utils_fp4.py           | 1 +
 .../layers/quantization/utils/marlin_utils_fp8.py           | 1 +
 .../layers/quantization/utils/marlin_utils_test.py          | 1 +
 .../layers/quantization/utils/marlin_utils_test_24.py       | 1 +
 .../layers/quantization/utils/marlin_utils_test_qqq.py      | 1 +
 .../model_executor/layers/quantization/utils/mxfp4_utils.py | 1 +
 .../layers/quantization/utils/nvfp4_emulation_utils.py      | 1 +
 .../model_executor/layers/quantization/utils/quant_utils.py | 1 +
 vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 1 +
 vllm/model_executor/layers/rejection_sampler.py             | 1 +
 vllm/model_executor/layers/resampler.py                     | 1 +
 vllm/model_executor/layers/rotary_embedding.py              | 1 +
 vllm/model_executor/layers/sampler.py                       | 1 +
 vllm/model_executor/layers/spec_decode_base_sampler.py      | 1 +
 vllm/model_executor/layers/typical_acceptance_sampler.py    | 1 +
 vllm/model_executor/layers/utils.py                         | 1 +
 vllm/model_executor/layers/vocab_parallel_embedding.py      | 1 +
 vllm/model_executor/model_loader/__init__.py                | 1 +
 vllm/model_executor/model_loader/base_loader.py             | 1 +
 vllm/model_executor/model_loader/bitsandbytes_loader.py     | 1 +
 vllm/model_executor/model_loader/default_loader.py          | 1 +
 vllm/model_executor/model_loader/dummy_loader.py            | 1 +
 vllm/model_executor/model_loader/gguf_loader.py             | 1 +
 vllm/model_executor/model_loader/neuron.py                  | 1 +
 vllm/model_executor/model_loader/neuronx_distributed.py     | 1 +
 vllm/model_executor/model_loader/runai_streamer_loader.py   | 1 +
 vllm/model_executor/model_loader/sharded_state_loader.py    | 1 +
 vllm/model_executor/model_loader/tensorizer.py              | 1 +
 vllm/model_executor/model_loader/tensorizer_loader.py       | 1 +
 vllm/model_executor/model_loader/utils.py                   | 1 +
 vllm/model_executor/model_loader/weight_utils.py            | 1 +
 vllm/model_executor/models/__init__.py                      | 1 +
 vllm/model_executor/models/adapters.py                      | 1 +
 vllm/model_executor/models/aimv2.py                         | 1 +
 vllm/model_executor/models/arctic.py                        | 1 +
 vllm/model_executor/models/aria.py                          | 1 +
 vllm/model_executor/models/aya_vision.py                    | 3 ++-
 vllm/model_executor/models/baichuan.py                      | 1 +
 vllm/model_executor/models/bamba.py                         | 1 +
 vllm/model_executor/models/bart.py                          | 1 +
 vllm/model_executor/models/bert.py                          | 1 +
 vllm/model_executor/models/bert_with_rope.py                | 1 +
 vllm/model_executor/models/blip.py                          | 1 +
 vllm/model_executor/models/blip2.py                         | 1 +
 vllm/model_executor/models/bloom.py                         | 1 +
 vllm/model_executor/models/chameleon.py                     | 1 +
 vllm/model_executor/models/chatglm.py                       | 1 +
 vllm/model_executor/models/clip.py                          | 1 +
 vllm/model_executor/models/commandr.py                      | 1 +
 vllm/model_executor/models/constant_size_cache.py           | 1 +
 vllm/model_executor/models/dbrx.py                          | 1 +
 vllm/model_executor/models/deepseek.py                      | 1 +
 vllm/model_executor/models/deepseek_mtp.py                  | 1 +
 vllm/model_executor/models/deepseek_v2.py                   | 1 +
 vllm/model_executor/models/deepseek_vl2.py                  | 1 +
 vllm/model_executor/models/eagle.py                         | 1 +
 vllm/model_executor/models/exaone.py                        | 1 +
 vllm/model_executor/models/fairseq2_llama.py                | 1 +
 vllm/model_executor/models/falcon.py                        | 1 +
 vllm/model_executor/models/falcon_h1.py                     | 1 +
 vllm/model_executor/models/florence2.py                     | 1 +
 vllm/model_executor/models/fuyu.py                          | 1 +
 vllm/model_executor/models/gemma.py                         | 1 +
 vllm/model_executor/models/gemma2.py                        | 1 +
 vllm/model_executor/models/gemma3.py                        | 1 +
 vllm/model_executor/models/gemma3_mm.py                     | 1 +
 vllm/model_executor/models/glm.py                           | 1 +
 vllm/model_executor/models/glm4.py                          | 1 +
 vllm/model_executor/models/glm4v.py                         | 1 +
 vllm/model_executor/models/gpt2.py                          | 1 +
 vllm/model_executor/models/gpt_bigcode.py                   | 1 +
 vllm/model_executor/models/gpt_j.py                         | 1 +
 vllm/model_executor/models/gpt_neox.py                      | 1 +
 vllm/model_executor/models/granite.py                       | 1 +
 vllm/model_executor/models/granite_speech.py                | 1 +
 vllm/model_executor/models/granitemoe.py                    | 1 +
 vllm/model_executor/models/granitemoehybrid.py              | 1 +
 vllm/model_executor/models/granitemoeshared.py              | 1 +
 vllm/model_executor/models/gritlm.py                        | 1 +
 vllm/model_executor/models/grok1.py                         | 1 +
 vllm/model_executor/models/h2ovl.py                         | 1 +
 vllm/model_executor/models/idefics2_vision_model.py         | 1 +
 vllm/model_executor/models/idefics3.py                      | 1 +
 vllm/model_executor/models/interfaces.py                    | 1 +
 vllm/model_executor/models/interfaces_base.py               | 1 +
 vllm/model_executor/models/intern_vit.py                    | 1 +
 vllm/model_executor/models/internlm2.py                     | 1 +
 vllm/model_executor/models/internlm2_ve.py                  | 1 +
 vllm/model_executor/models/internvl.py                      | 1 +
 vllm/model_executor/models/jais.py                          | 1 +
 vllm/model_executor/models/jamba.py                         | 1 +
 vllm/model_executor/models/kimi_vl.py                       | 1 +
 vllm/model_executor/models/llama.py                         | 1 +
 vllm/model_executor/models/llama4.py                        | 1 +
 vllm/model_executor/models/llama_eagle.py                   | 1 +
 vllm/model_executor/models/llama_eagle3.py                  | 1 +
 vllm/model_executor/models/llava.py                         | 1 +
 vllm/model_executor/models/llava_next.py                    | 1 +
 vllm/model_executor/models/llava_next_video.py              | 1 +
 vllm/model_executor/models/llava_onevision.py               | 1 +
 vllm/model_executor/models/mamba.py                         | 1 +
 vllm/model_executor/models/mamba2.py                        | 1 +
 vllm/model_executor/models/mamba_cache.py                   | 1 +
 vllm/model_executor/models/medusa.py                        | 1 +
 vllm/model_executor/models/mimo.py                          | 1 +
 vllm/model_executor/models/mimo_mtp.py                      | 1 +
 vllm/model_executor/models/minicpm.py                       | 1 +
 vllm/model_executor/models/minicpm3.py                      | 1 +
 vllm/model_executor/models/minicpm_eagle.py                 | 1 +
 vllm/model_executor/models/minicpmo.py                      | 1 +
 vllm/model_executor/models/minicpmv.py                      | 1 +
 vllm/model_executor/models/minimax_cache.py                 | 1 +
 vllm/model_executor/models/minimax_text_01.py               | 1 +
 vllm/model_executor/models/minimax_vl_01.py                 | 1 +
 vllm/model_executor/models/mistral3.py                      | 1 +
 vllm/model_executor/models/mixtral.py                       | 1 +
 vllm/model_executor/models/mixtral_quant.py                 | 1 +
 vllm/model_executor/models/mllama.py                        | 1 +
 vllm/model_executor/models/mllama4.py                       | 1 +
 vllm/model_executor/models/mlp_speculator.py                | 1 +
 vllm/model_executor/models/modernbert.py                    | 1 +
 vllm/model_executor/models/module_mapping.py                | 1 +
 vllm/model_executor/models/molmo.py                         | 1 +
 vllm/model_executor/models/moonvit.py                       | 1 +
 vllm/model_executor/models/mpt.py                           | 1 +
 vllm/model_executor/models/nemotron.py                      | 1 +
 vllm/model_executor/models/nemotron_nas.py                  | 1 +
 vllm/model_executor/models/nvlm_d.py                        | 1 +
 vllm/model_executor/models/olmo.py                          | 1 +
 vllm/model_executor/models/olmo2.py                         | 1 +
 vllm/model_executor/models/olmoe.py                         | 1 +
 vllm/model_executor/models/opt.py                           | 1 +
 vllm/model_executor/models/orion.py                         | 1 +
 vllm/model_executor/models/ovis.py                          | 1 +
 vllm/model_executor/models/paligemma.py                     | 1 +
 vllm/model_executor/models/persimmon.py                     | 1 +
 vllm/model_executor/models/phi.py                           | 1 +
 vllm/model_executor/models/phi3.py                          | 1 +
 vllm/model_executor/models/phi3_small.py                    | 1 +
 vllm/model_executor/models/phi3v.py                         | 1 +
 vllm/model_executor/models/phi4mm.py                        | 1 +
 vllm/model_executor/models/phi4mm_audio.py                  | 1 +
 vllm/model_executor/models/phi4mm_utils.py                  | 1 +
 vllm/model_executor/models/phimoe.py                        | 1 +
 vllm/model_executor/models/pixtral.py                       | 1 +
 vllm/model_executor/models/plamo2.py                        | 1 +
 vllm/model_executor/models/prithvi_geospatial_mae.py        | 1 +
 vllm/model_executor/models/qwen.py                          | 1 +
 vllm/model_executor/models/qwen2.py                         | 1 +
 vllm/model_executor/models/qwen2_5_omni_thinker.py          | 1 +
 vllm/model_executor/models/qwen2_5_vl.py                    | 1 +
 vllm/model_executor/models/qwen2_audio.py                   | 1 +
 vllm/model_executor/models/qwen2_moe.py                     | 1 +
 vllm/model_executor/models/qwen2_rm.py                      | 1 +
 vllm/model_executor/models/qwen2_vl.py                      | 1 +
 vllm/model_executor/models/qwen3.py                         | 1 +
 vllm/model_executor/models/qwen3_moe.py                     | 1 +
 vllm/model_executor/models/qwen_vl.py                       | 1 +
 vllm/model_executor/models/registry.py                      | 1 +
 vllm/model_executor/models/roberta.py                       | 1 +
 vllm/model_executor/models/siglip.py                        | 1 +
 vllm/model_executor/models/skyworkr1v.py                    | 1 +
 vllm/model_executor/models/smolvlm.py                       | 1 +
 vllm/model_executor/models/solar.py                         | 1 +
 vllm/model_executor/models/stablelm.py                      | 1 +
 vllm/model_executor/models/starcoder2.py                    | 1 +
 vllm/model_executor/models/telechat2.py                     | 1 +
 vllm/model_executor/models/teleflm.py                       | 1 +
 vllm/model_executor/models/transformers.py                  | 1 +
 vllm/model_executor/models/ultravox.py                      | 1 +
 vllm/model_executor/models/utils.py                         | 1 +
 vllm/model_executor/models/vision.py                        | 1 +
 vllm/model_executor/models/whisper.py                       | 1 +
 vllm/model_executor/models/zamba2.py                        | 1 +
 vllm/model_executor/parameter.py                            | 1 +
 vllm/model_executor/pooling_metadata.py                     | 1 +
 vllm/model_executor/sampling_metadata.py                    | 1 +
 vllm/model_executor/utils.py                                | 1 +
 vllm/multimodal/__init__.py                                 | 1 +
 vllm/multimodal/audio.py                                    | 1 +
 vllm/multimodal/base.py                                     | 1 +
 vllm/multimodal/hasher.py                                   | 1 +
 vllm/multimodal/image.py                                    | 1 +
 vllm/multimodal/inputs.py                                   | 1 +
 vllm/multimodal/parse.py                                    | 1 +
 vllm/multimodal/processing.py                               | 1 +
 vllm/multimodal/profiling.py                                | 1 +
 vllm/multimodal/registry.py                                 | 1 +
 vllm/multimodal/utils.py                                    | 1 +
 vllm/multimodal/video.py                                    | 1 +
 vllm/outputs.py                                             | 1 +
 vllm/platforms/__init__.py                                  | 1 +
 vllm/platforms/cpu.py                                       | 1 +
 vllm/platforms/cuda.py                                      | 1 +
 vllm/platforms/hpu.py                                       | 1 +
 vllm/platforms/interface.py                                 | 1 +
 vllm/platforms/neuron.py                                    | 1 +
 vllm/platforms/rocm.py                                      | 1 +
 vllm/platforms/tpu.py                                       | 1 +
 vllm/platforms/xpu.py                                       | 1 +
 vllm/plugins/__init__.py                                    | 1 +
 vllm/plugins/lora_resolvers/filesystem_resolver.py          | 1 +
 vllm/pooling_params.py                                      | 1 +
 vllm/profiler/layerwise_profile.py                          | 1 +
 vllm/profiler/utils.py                                      | 1 +
 vllm/prompt_adapter/layers.py                               | 1 +
 vllm/prompt_adapter/models.py                               | 1 +
 vllm/prompt_adapter/request.py                              | 1 +
 vllm/prompt_adapter/utils.py                                | 1 +
 vllm/prompt_adapter/worker_manager.py                       | 1 +
 vllm/reasoning/__init__.py                                  | 1 +
 vllm/reasoning/abs_reasoning_parsers.py                     | 1 +
 vllm/reasoning/deepseek_r1_reasoning_parser.py              | 1 +
 vllm/reasoning/granite_reasoning_parser.py                  | 1 +
 vllm/reasoning/qwen3_reasoning_parser.py                    | 1 +
 vllm/sampling_params.py                                     | 1 +
 vllm/scalar_type.py                                         | 1 +
 vllm/scripts.py                                             | 1 +
 vllm/sequence.py                                            | 1 +
 vllm/spec_decode/batch_expansion.py                         | 1 +
 vllm/spec_decode/draft_model_runner.py                      | 1 +
 vllm/spec_decode/interfaces.py                              | 1 +
 vllm/spec_decode/medusa_worker.py                           | 1 +
 vllm/spec_decode/metrics.py                                 | 1 +
 vllm/spec_decode/mlp_speculator_worker.py                   | 1 +
 vllm/spec_decode/mqa_scorer.py                              | 1 +
 vllm/spec_decode/multi_step_worker.py                       | 1 +
 vllm/spec_decode/ngram_worker.py                            | 1 +
 vllm/spec_decode/proposer_worker_base.py                    | 1 +
 vllm/spec_decode/smaller_tp_proposer_worker.py              | 1 +
 vllm/spec_decode/spec_decode_worker.py                      | 1 +
 vllm/spec_decode/target_model_runner.py                     | 1 +
 vllm/spec_decode/top1_proposer.py                           | 1 +
 vllm/spec_decode/util.py                                    | 1 +
 vllm/test_utils.py                                          | 1 +
 vllm/third_party/pynvml.py                                  | 1 +
 vllm/tracing.py                                             | 1 +
 vllm/transformers_utils/__init__.py                         | 1 +
 vllm/transformers_utils/chat_templates/__init__.py          | 1 +
 vllm/transformers_utils/chat_templates/registry.py          | 1 +
 vllm/transformers_utils/config.py                           | 1 +
 vllm/transformers_utils/configs/__init__.py                 | 1 +
 vllm/transformers_utils/configs/arctic.py                   | 1 +
 vllm/transformers_utils/configs/chatglm.py                  | 1 +
 vllm/transformers_utils/configs/cohere2.py                  | 1 +
 vllm/transformers_utils/configs/dbrx.py                     | 1 +
 vllm/transformers_utils/configs/deepseek_vl2.py             | 1 +
 vllm/transformers_utils/configs/eagle.py                    | 1 +
 vllm/transformers_utils/configs/exaone.py                   | 1 +
 vllm/transformers_utils/configs/falcon.py                   | 1 +
 vllm/transformers_utils/configs/h2ovl.py                    | 1 +
 vllm/transformers_utils/configs/internvl.py                 | 1 +
 vllm/transformers_utils/configs/jais.py                     | 1 +
 vllm/transformers_utils/configs/kimi_vl.py                  | 1 +
 vllm/transformers_utils/configs/medusa.py                   | 1 +
 vllm/transformers_utils/configs/minimax_text_01.py          | 1 +
 vllm/transformers_utils/configs/minimax_vl_01.py            | 1 +
 vllm/transformers_utils/configs/mllama.py                   | 1 +
 vllm/transformers_utils/configs/mlp_speculator.py           | 1 +
 vllm/transformers_utils/configs/moonvit.py                  | 1 +
 vllm/transformers_utils/configs/mpt.py                      | 1 +
 vllm/transformers_utils/configs/nemotron.py                 | 1 +
 vllm/transformers_utils/configs/nvlm_d.py                   | 1 +
 vllm/transformers_utils/configs/ovis.py                     | 1 +
 vllm/transformers_utils/configs/skyworkr1v.py               | 1 +
 vllm/transformers_utils/configs/solar.py                    | 1 +
 vllm/transformers_utils/configs/telechat2.py                | 1 +
 vllm/transformers_utils/configs/ultravox.py                 | 1 +
 vllm/transformers_utils/detokenizer.py                      | 1 +
 vllm/transformers_utils/detokenizer_utils.py                | 1 +
 vllm/transformers_utils/processor.py                        | 1 +
 vllm/transformers_utils/processors/__init__.py              | 1 +
 vllm/transformers_utils/processors/deepseek_vl2.py          | 1 +
 vllm/transformers_utils/processors/ovis.py                  | 1 +
 vllm/transformers_utils/s3_utils.py                         | 1 +
 vllm/transformers_utils/tokenizer.py                        | 1 +
 vllm/transformers_utils/tokenizer_base.py                   | 1 +
 vllm/transformers_utils/tokenizer_group.py                  | 1 +
 vllm/transformers_utils/tokenizers/__init__.py              | 1 +
 vllm/transformers_utils/tokenizers/mistral.py               | 1 +
 vllm/transformers_utils/utils.py                            | 1 +
 vllm/triton_utils/__init__.py                               | 1 +
 vllm/triton_utils/importing.py                              | 1 +
 vllm/usage/usage_lib.py                                     | 1 +
 vllm/utils.py                                               | 1 +
 vllm/v1/attention/backends/flash_attn.py                    | 1 +
 vllm/v1/attention/backends/flashinfer.py                    | 1 +
 vllm/v1/attention/backends/mla/common.py                    | 1 +
 vllm/v1/attention/backends/mla/flashmla.py                  | 1 +
 vllm/v1/attention/backends/mla/rocm_aiter_mla.py            | 1 +
 vllm/v1/attention/backends/mla/triton_mla.py                | 1 +
 vllm/v1/attention/backends/pallas.py                        | 1 +
 vllm/v1/attention/backends/triton_attn.py                   | 1 +
 vllm/v1/attention/backends/utils.py                         | 1 +
 vllm/v1/core/block_pool.py                                  | 1 +
 vllm/v1/core/encoder_cache_manager.py                       | 1 +
 vllm/v1/core/kv_cache_manager.py                            | 1 +
 vllm/v1/core/kv_cache_utils.py                              | 1 +
 vllm/v1/core/sched/interface.py                             | 1 +
 vllm/v1/core/sched/output.py                                | 1 +
 vllm/v1/core/sched/scheduler.py                             | 1 +
 vllm/v1/core/sched/utils.py                                 | 1 +
 vllm/v1/core/single_type_kv_cache_manager.py                | 1 +
 vllm/v1/engine/__init__.py                                  | 1 +
 vllm/v1/engine/async_llm.py                                 | 1 +
 vllm/v1/engine/coordinator.py                               | 1 +
 vllm/v1/engine/core.py                                      | 1 +
 vllm/v1/engine/core_client.py                               | 1 +
 vllm/v1/engine/detokenizer.py                               | 1 +
 vllm/v1/engine/exceptions.py                                | 1 +
 vllm/v1/engine/llm_engine.py                                | 1 +
 vllm/v1/engine/logprobs.py                                  | 1 +
 vllm/v1/engine/mm_input_cache.py                            | 1 +
 vllm/v1/engine/output_processor.py                          | 1 +
 vllm/v1/engine/parallel_sampling.py                         | 1 +
 vllm/v1/engine/processor.py                                 | 1 +
 vllm/v1/executor/abstract.py                                | 1 +
 vllm/v1/executor/multiproc_executor.py                      | 1 +
 vllm/v1/executor/ray_distributed_executor.py                | 1 +
 vllm/v1/kv_cache_interface.py                               | 1 +
 vllm/v1/metrics/loggers.py                                  | 1 +
 vllm/v1/metrics/prometheus.py                               | 1 +
 vllm/v1/metrics/ray_wrappers.py                             | 1 +
 vllm/v1/metrics/reader.py                                   | 1 +
 vllm/v1/metrics/stats.py                                    | 1 +
 vllm/v1/outputs.py                                          | 1 +
 vllm/v1/request.py                                          | 1 +
 vllm/v1/sample/metadata.py                                  | 1 +
 vllm/v1/sample/ops/bad_words.py                             | 1 +
 vllm/v1/sample/ops/penalties.py                             | 1 +
 vllm/v1/sample/ops/topk_topp_sampler.py                     | 1 +
 vllm/v1/sample/rejection_sampler.py                         | 1 +
 vllm/v1/sample/sampler.py                                   | 1 +
 vllm/v1/sample/tpu/metadata.py                              | 1 +
 vllm/v1/sample/tpu/sampler.py                               | 1 +
 vllm/v1/serial_utils.py                                     | 1 +
 vllm/v1/spec_decode/eagle.py                                | 1 +
 vllm/v1/spec_decode/medusa.py                               | 1 +
 vllm/v1/spec_decode/metadata.py                             | 1 +
 vllm/v1/spec_decode/metrics.py                              | 1 +
 vllm/v1/spec_decode/ngram_proposer.py                       | 1 +
 vllm/v1/spec_decode/utils.py                                | 1 +
 vllm/v1/structured_output/__init__.py                       | 1 +
 vllm/v1/structured_output/backend_guidance.py               | 1 +
 vllm/v1/structured_output/backend_types.py                  | 1 +
 vllm/v1/structured_output/backend_xgrammar.py               | 1 +
 vllm/v1/structured_output/request.py                        | 1 +
 vllm/v1/structured_output/utils.py                          | 1 +
 vllm/v1/utils.py                                            | 1 +
 vllm/v1/worker/block_table.py                               | 1 +
 vllm/v1/worker/gpu_input_batch.py                           | 1 +
 vllm/v1/worker/gpu_model_runner.py                          | 1 +
 vllm/v1/worker/gpu_worker.py                                | 1 +
 vllm/v1/worker/lora_model_runner_mixin.py                   | 1 +
 vllm/v1/worker/tpu_model_runner.py                          | 1 +
 vllm/v1/worker/tpu_worker.py                                | 1 +
 vllm/v1/worker/utils.py                                     | 1 +
 vllm/v1/worker/worker_base.py                               | 1 +
 vllm/version.py                                             | 1 +
 vllm/worker/cache_engine.py                                 | 1 +
 vllm/worker/cpu_enc_dec_model_runner.py                     | 1 +
 vllm/worker/cpu_model_runner.py                             | 1 +
 vllm/worker/cpu_pooling_model_runner.py                     | 1 +
 vllm/worker/cpu_worker.py                                   | 1 +
 vllm/worker/enc_dec_model_runner.py                         | 1 +
 vllm/worker/hpu_model_runner.py                             | 1 +
 vllm/worker/hpu_worker.py                                   | 1 +
 vllm/worker/model_runner.py                                 | 1 +
 vllm/worker/model_runner_base.py                            | 1 +
 vllm/worker/multi_step_hpu_worker.py                        | 1 +
 vllm/worker/multi_step_model_runner.py                      | 1 +
 vllm/worker/multi_step_neuron_model_runner.py               | 1 +
 vllm/worker/multi_step_neuronx_distributed_model_runner.py  | 1 +
 vllm/worker/multi_step_tpu_worker.py                        | 1 +
 vllm/worker/multi_step_worker.py                            | 1 +
 vllm/worker/neuron_model_runner.py                          | 1 +
 vllm/worker/neuron_worker.py                                | 1 +
 vllm/worker/neuronx_distributed_model_runner.py             | 1 +
 vllm/worker/pooling_model_runner.py                         | 1 +
 vllm/worker/tpu_model_runner.py                             | 1 +
 vllm/worker/tpu_worker.py                                   | 1 +
 vllm/worker/utils.py                                        | 1 +
 vllm/worker/worker.py                                       | 1 +
 vllm/worker/worker_base.py                                  | 1 +
 vllm/worker/xpu_model_runner.py                             | 1 +
 vllm/worker/xpu_worker.py                                   | 1 +
 1432 files changed, 1441 insertions(+), 6 deletions(-)

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index e29881fcb..68aff793a 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
index 270663c41..7045d8810 100644
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os
diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py
index 769d2efda..c0d60dd53 100644
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
 import pytest
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 409a6ca82..930adfaf3 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 7f2a2d8dc..a4f1638c1 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import os
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
index 778a3a8d8..8532ff7ef 100644
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 10a7a2f5a..053fd52c3 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
index e5f179a0f..ddea1d2b1 100644
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from lmdeploy.serve.openai.api_client import APIClient
 
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 2a7b37991..fb3b9d5e3 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
 import json
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 85e6eda7f..ddb38e304 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import io
 import json
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index d86bf045e..80a9246aa 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index de62bf5c6..c06857247 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""
 
 import argparse
diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index 109624c87..00869fa94 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Offline benchmark to test the long document QA throughput.
 
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index ffaa80357..3e4704f0b 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the efficiency of prefix caching.
 
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index a05dd24de..5496703f2 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline prioritization."""
 
 import argparse
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 6bd9f1b49..81428fb7d 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput.
 
 On the server side, run one of the following commands:
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 6a50f47d3..3848ebda9 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput with structured outputs.
 
 On the server side, run one of the following commands:
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 7a13babda..d19753d40 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""
 
 import argparse
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index b0c4fca92..272b7979c 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index da258f98e..9ec270bbd 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
index 7e9f5a7fc..b4f3c6bf9 100644
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Cutlass bench utils
 from collections.abc import Iterable
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 08e93837f..cec422e8d 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
index d31b623a1..25b96ef56 100644
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
index fce156e1c..f62d8102e 100644
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py
index fd19b40bf..b1df2f255 100644
--- a/benchmarks/disagg_benchmarks/round_robin_proxy.py
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import itertools
diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
index 484d0cb3c..74fa56d07 100644
--- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
+++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index 37a9173a1..901524214 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle as pkl
 import time
diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py
index 36d03e40e..640a33419 100644
--- a/benchmarks/kernels/bench_fp8_gemm.py
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import copy
 import itertools
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
index e9934aa47..42de062b0 100644
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py
index d40ab70ec..97ee06034 100644
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
index d39d8a6e3..3383fb788 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
 kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 2197bceab..1be83b84e 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.utils.benchmark as benchmark
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index f21ca97ee..69978ec6b 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 6c1284930..3d38d4b35 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index f8f1db047..0f896f187 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index b17baff2e..9ea1fddae 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.utils.benchmark as benchmark
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c2f766085..6cb55b359 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 333986fdf..dba1f3943 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 from typing import Any, TypedDict
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 54f05e723..7e0376c18 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 import time
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 2463dfebe..6ab26f5f1 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
index d720083b6..4cf633a81 100644
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from typing import Optional, Union
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 944024ca3..b81baf17a 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate
 from typing import Optional
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
index 70190ba24..18c459c31 100644
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 WEIGHT_SHAPES = {
     "ideal": [[4 * 256 * 32, 256 * 32]],
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 6315c1ee6..4fcdbadd6 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from sglang quantization/tuning_block_wise_kernel.py
 
 import argparse
diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index e37764825..e67ce0545 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # fmt: off
 # ruff: noqa: E501
 import time
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index 0c86e4072..9a4da0ef5 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import pickle
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
index 877a29fee..4bbb36bb4 100644
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from collections.abc import Iterable
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index afe159ddd..a27f02394 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
index d5701a8fb..0957a9c65 100644
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import cProfile
 import pstats
diff --git a/cmake/hipify.py b/cmake/hipify.py
index a15577125..55d378f5b 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index d64f0d0a5..1dd7101ac 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from typing import Union
diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py
index 15f008d4f..49f33718a 100644
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py
index 4ac7121ab..18fb6c1a8 100644
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index 3114e14ba..9af7833d0 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 import math
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 6f290efe4..7cfc89605 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from dataclasses import dataclass, field
 from pathlib import Path
diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py
index e5f8549d8..f67941d2a 100644
--- a/docs/mkdocs/hooks/remove_announcement.py
+++ b/docs/mkdocs/hooks/remove_announcement.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from typing import Literal
 
diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py
index c73882808..6484581ed 100644
--- a/docs/mkdocs/hooks/url_schemes.py
+++ b/docs/mkdocs/hooks/url_schemes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import regex as re
 from mkdocs.config.defaults import MkDocsConfig
 from mkdocs.structure.files import Files
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 56cdd6861..8e5cac78a 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference
 with the correct prompt format on audio language models.
diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/offline_inference/automatic_prefix_caching.py
index 0d8c73304..a01a9565a 100644
--- a/examples/offline_inference/automatic_prefix_caching.py
+++ b/examples/offline_inference/automatic_prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstration script for Automatic Prefix Caching (APC) in vLLM.
 
diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py
index ae5ae7cb4..78bfda9bc 100644
--- a/examples/offline_inference/basic/basic.py
+++ b/examples/offline_inference/basic/basic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py
index b0bb5aa71..d078c517d 100644
--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py
index 40ccb1294..219064e97 100644
--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py
index 38a73ccca..fc5ca2378 100644
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py
index 72f4a8208..6a41ef4d8 100644
--- a/examples/offline_inference/basic/generate.py
+++ b/examples/offline_inference/basic/generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py
index 3da73c6c4..6a08de2d2 100644
--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py
index c1edfb52f..b1c1ef620 100644
--- a/examples/offline_inference/batch_llm_inference.py
+++ b/examples/offline_inference/batch_llm_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use Ray Data for data parallel batch inference.
 
diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
index 61230d895..6e56e24f2 100644
--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 import json
diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py
index 1a70446c3..8d7666418 100644
--- a/examples/offline_inference/context_extension.py
+++ b/examples/offline_inference/context_extension.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index 15906e1a2..3eccb4e11 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Usage:
 Single node:
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
index 4ae5d3310..8f3d1a5c0 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
index 5757a8a84..0bfe7ec0e 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py
index 3ccab0dcd..05a361fee 100644
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of disaggregated prefilling
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py
index 606ce7799..ce977ee99 100644
--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json
 import os
diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py
index 23f60c431..e68128399 100644
--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py
index 59c0592ae..7f5d74d9a 100644
--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py
index 83dd1f667..0da6fa5c4 100644
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index ae3737e37..d27a902ed 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
index 5d5e55a83..d7f2a1633 100644
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates using the `LLMEngine`
 for processing prompts with various sampling parameters.
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index 5bb2327a3..cc78c0cbb 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Validates the loading of a model saved with the sharded_state format.
 This script demonstrates how to load a model that was previously saved
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index 33c660015..00d4cb9eb 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use LoRA with different quantization techniques
 for offline inference.
diff --git a/examples/offline_inference/metrics.py b/examples/offline_inference/metrics.py
index 7927f758c..00fb3f5bc 100644
--- a/examples/offline_inference/metrics.py
+++ b/examples/offline_inference/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector
diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index 98fef2648..330103d58 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 import argparse
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
index b750397f4..d5b1b4ad2 100644
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the usage of text generation with an LLM model,
 comparing the performance with and without speculative decoding.
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 1fa2f16f8..f0c00bcaa 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use the multi-LoRA functionality
 for offline inference.
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py
index f2d7698f2..7826629a3 100644
--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py
index 5d7fb819d..0b2070c8e 100644
--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to run offline inference with an EAGLE speculative
 decoding model on neuron. To use EAGLE speculative decoding, you must use
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
index ec38525b9..c0ecfac50 100644
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py
index a9478650b..6ff8faabd 100644
--- a/examples/offline_inference/neuron_multimodal.py
+++ b/examples/offline_inference/neuron_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import requests
 import torch
 from neuronx_distributed_inference.models.mllama.utils import add_instruct
diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py
index ecacbab77..2ef69f298 100644
--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to run offline inference with a speculative
 decoding model on neuron.
diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py
index d3dad2495..699891382 100644
--- a/examples/offline_inference/prefix_caching.py
+++ b/examples/offline_inference/prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory
diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
index 21f7668ad..567c448a8 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This is a demo script showing how to use the
 PrithviGeospatialMAE model with vLLM
diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py
index 244a64b89..392fba8fc 100644
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 import json
diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py
index 82737d538..5200be826 100644
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import dataclasses
diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py
index 9f6a60223..5d79222a1 100644
--- a/examples/offline_inference/prompt_embed_inference.py
+++ b/examples/offline_inference/prompt_embed_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstrates how to generate prompt embeddings using
 Hugging Face Transformers  and use them as input to vLLM
diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py
index 6482490d1..62effd5c8 100644
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference
 with the correct prompt format on Qwen2.5-Omni (thinker only).
diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py
index 856a35b0e..d8d61667f 100644
--- a/examples/offline_inference/qwen_1m.py
+++ b/examples/offline_inference/qwen_1m.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from urllib.request import urlopen
 
diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py
index 6d048986e..d909438b4 100644
--- a/examples/offline_inference/reproducibility.py
+++ b/examples/offline_inference/reproducibility.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstrates how to achieve reproducibility in vLLM.
 
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index a8f6977e2..c6e63531a 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 a simple demonstration of RLHF with vLLM, inspired by
 the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py
index 76eafdca1..096363e68 100644
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 a simple demonstration to show how to co-locate
 vLLM worker with training actors on the same GPUs,
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index 3461af707..c445224d7 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
 
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 860fe2b5f..9b154e370 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Saves each worker's model state dict directly to a checkpoint, which enables a
 fast load path for large tensor-parallel models where each worker only needs to
diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py
index d583110c8..46858fffa 100644
--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py
index 9ed729960..8ef121ebe 100644
--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of guided decoding
 to generate structured outputs using vLLM. It shows how to apply
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
index 2fa49c083..3d3d7946c 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 experimental support for tensor-parallel inference with torchrun,
 see https://github.com/vllm-project/vllm/issues/11400 for
diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index f3c2859d4..9776f4fe3 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 2ef87f4f4..15dbd9f44 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for text generation.
diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
index cee02d06c..1f5bd4ad7 100644
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for multimodal embedding.
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 7ce28c5a4..de6365c0d 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 multi-image input on vision language models for text generation,
diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py
index cc190e91c..84854911b 100644
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example Python client for `vllm.entrypoints.api_server`
 Start the demo server:
     python -m vllm.entrypoints.api_server --model <model_name>
diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py
index e57b94e88..63c9ff9e9 100644
--- a/examples/online_serving/cohere_rerank_client.py
+++ b/examples/online_serving/cohere_rerank_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 the Cohere SDK: https://github.com/cohere-ai/cohere-python
diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
index 2ffba4a7e..16c32dcaa 100644
--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file provides a disaggregated prefilling proxy demo to demonstrate an
 example usage of XpYd disaggregated prefilling.
diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py
index 3f2a3d01b..d5d0a07a2 100644
--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example for starting a Gradio OpenAI Chatbot Webserver
 Start vLLM API server:
     vllm serve meta-llama/Llama-2-7b-chat-hf
diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py
index fd341ff49..86d9ceb48 100644
--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example for starting a Gradio Webserver
 Start vLLM API server:
     python -m vllm.entrypoints.api_server \
diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py
index 7eb3d2193..908d6a924 100644
--- a/examples/online_serving/jinaai_rerank_client.py
+++ b/examples/online_serving/jinaai_rerank_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 Jina and Cohere https://jina.ai/reranker
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
index 65d74dcca..584db53db 100644
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional, Union
 
 import msgspec
diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py
index 2856e3be3..def95deb0 100644
--- a/examples/online_serving/openai_chat_completion_client.py
+++ b/examples/online_serving/openai_chat_completion_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example Python client for OpenAI Chat Completion using vLLM API server
 NOTE: start a supported chat completion model server with `vllm serve`, e.g.
     vllm serve meta-llama/Llama-2-7b-chat-hf
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 8c3c6ecdd..c99b5148d 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """An example showing how to use vLLM to serve multimodal models
 and run online serving with OpenAI client.
 
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py
index a0d7841f6..41dbb3236 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Set up this example by starting a vLLM OpenAI-compatible server with tool call
 options enabled. For example:
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
index 45c4232fe..7eb866821 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 To run this example, you can start the vLLM server
 without any specific flags:
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
index a4134ea43..64379083d 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 To run this example, you need to start the vLLM server:
 
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
index c73208abe..ec7d8b954 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from openai import OpenAI
 
 # This example demonstrates the `structural_tag` response format.
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
index 1ca61a8d5..bfbee7513 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example shows how to generate structured outputs from reasoning models
 like DeepSeekR1. The thinking process will not be guided by the JSON
diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
index a5febad45..4006d07f7 100644
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example demonstrates how to use tool calling with reasoning models 
 like QwQ-32B. The reasoning_content will not be parsed by the tool 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index f6b808211..932dbeb2e 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index f984fbabf..5a9192977 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
index ee519e555..70f3c2f19 100644
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import base64
diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/openai_classification_client.py
index 649cfa5d6..b10e7acbd 100644
--- a/examples/online_serving/openai_classification_client.py
+++ b/examples/online_serving/openai_classification_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import pprint
diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
index b1d21b5e4..df6e4e942 100644
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py
index 7891e14cb..2e0d168d6 100644
--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example online usage of Score API.
 
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py
index a055654e9..6bc390861 100644
--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/openai_embedding_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from openai import OpenAI
 
diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py
index 4544dcfb5..653da8d18 100644
--- a/examples/online_serving/openai_embedding_matryoshka_fy.py
+++ b/examples/online_serving/openai_embedding_matryoshka_fy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example Python client for embedding API dimensions using vLLM API server
 NOTE:
     start a supported Matryoshka Embeddings model server with `vllm serve`, e.g.
diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py
index 2620a1232..8252b3670 100644
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/openai_pooling_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example online usage of Pooling API.
 
diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py
index eb501ae72..12d45de3c 100644
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import json
 
diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py
index 33d365f0c..018d986ad 100644
--- a/examples/online_serving/opentelemetry/dummy_client.py
+++ b/examples/online_serving/opentelemetry/dummy_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import requests
 from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py
index 85ea23407..3a9042138 100644
--- a/examples/online_serving/prompt_embed_inference_with_openai_client.py
+++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 vLLM OpenAI-Compatible Client with Prompt Embeddings
 
diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py
index a76020130..9471563dd 100644
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
 See more details at:
diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/online_serving/retrieval_augmented_generation_with_langchain.py
index 37af3b388..d9a4cadb0 100644
--- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Retrieval Augmented Generation (RAG) Implementation with Langchain
 ==================================================================
diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
index 08796b1b3..be4796acd 100644
--- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
 ================================================================
diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py
index 0722aa671..dab56172e 100644
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 vLLM Chat Assistant - A Streamlit Web Interface
 
diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py
index 0781a27f1..a512d8a31 100644
--- a/examples/online_serving/utils.py
+++ b/examples/online_serving/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from openai import APIConnectionError, OpenAI
 from openai.pagination import SyncPage
 from openai.types.model import Model
diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py
index 98eafb31e..354e4cc8c 100644
--- a/examples/others/lmcache/cpu_offload_lmcache.py
+++ b/examples/others/lmcache/cpu_offload_lmcache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of cpu offloading
 with LMCache in vLLM v1 or v0.
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py
index b2b7b3b2c..6669eb3fb 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v0.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v0.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of disaggregated prefilling
 with LMCache.
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
index 20155c203..5d8e38c73 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os
diff --git a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
index 89945d67a..508cf4a5a 100644
--- a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of remote KV cache sharing
 with LMCache.
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
index 175777630..9e1003a5c 100644
--- a/examples/others/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import dataclasses
diff --git a/find_cuda_init.py b/find_cuda_init.py
index 0d13b2f86..308fc6fc2 100644
--- a/find_cuda_init.py
+++ b/find_cuda_init.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
 import traceback
diff --git a/setup.py b/setup.py
index c190864dd..b07cdea30 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ctypes
 import importlib.util
diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py
index 1e3c2d1a4..163185b90 100644
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """vllm.entrypoints.api_server with some extra logging for testing."""
 from collections.abc import Iterable
 from typing import Any
diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py
index 1a20e2c13..375b248eb 100644
--- a/tests/async_engine/conftest.py
+++ b/tests/async_engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 410cece79..38ecaf223 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import subprocess
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index b6f448714..1a31bdbfc 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py
index fd6d89d4e..1851eeeda 100644
--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 11c8e7a4b..46be4a3c3 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the short outputs of HF and vLLM when using greedy sampling.
 
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 06c9e25ed..eb5b09ff7 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of HF and vLLM when using greedy sampling.
 
 It tests chunked prefill. Chunked prefill can be enabled by
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index be3ad1239..28bfe9e7c 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from ..utils import compare_two_settings
 
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 76b266aad..34f9389c8 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 63dc0f8c8..341a39a42 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the short outputs of HF and vLLM when using greedy sampling.
 
 VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py
index 8537459b9..2279c846e 100644
--- a/tests/benchmarks/test_latency_cli.py
+++ b/tests/benchmarks/test_latency_cli.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 
 import pytest
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index b746d6b78..a31819526 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 
 import pytest
diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py
index 2045b3629..b61e51db4 100644
--- a/tests/benchmarks/test_throughput_cli.py
+++ b/tests/benchmarks/test_throughput_cli.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 
 import pytest
diff --git a/tests/build_cython.py b/tests/build_cython.py
index 9dea6bcd6..f4a334aa3 100644
--- a/tests/build_cython.py
+++ b/tests/build_cython.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import Cython.Compiler.Options
 from Cython.Build import cythonize
 from setuptools import setup
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index 5a02c4e2b..60334f5e4 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
 from typing import Callable, Union
diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py
index 7118810a5..d86ca3710 100644
--- a/tests/compile/conftest.py
+++ b/tests/compile/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py
index a71a40cda..3188ea40f 100644
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import os
 
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 5ce520a44..852aa44d4 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test the piecewise compilation with a simple model so that we
 can exactly calculate the expected output and side effects.
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index 22560befc..2464d7889 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test the piecewise compilation with a simple model, comparing the output
 with and without the piecewise compilation.
diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py
index 8e4e0ba83..1e4ee571f 100644
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index b6b45d1cb..dc6cfe9da 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import dataclasses
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 397517b86..1d000fe00 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index 5d38ff914..aade29b99 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 509593e73..0c25aae52 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index b630d0e85..251cc46e9 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
 import pytest
diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py
index 2cd7ebaac..c689befdf 100644
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index 9eae48d60..df36b86ab 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py
index 0934c6113..5e39f6821 100644
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 6336c6c2c..5ec3926bd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
 import tempfile
diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py
index b7a9863f4..6afe98d78 100644
--- a/tests/core/block/conftest.py
+++ b/tests/core/block/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py
index 83259b690..e2c6c66b2 100644
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Callable, Optional
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 9e8e315d8..f296c81e1 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import cycle
 
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py
index 039b5e739..3429a858d 100644
--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py
index 68d9618ae..9eed264fd 100644
--- a/tests/core/block/test_block_manager.py
+++ b/tests/core/block/test_block_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py
index 250c9a749..ba0850011 100644
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py
index 202608730..65400899b 100644
--- a/tests/core/block/test_common.py
+++ b/tests/core/block/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py
index a1414edd9..795eef674 100644
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py
index 4b9454c84..a31d1c46b 100644
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index 50233624f..46e224c6f 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import random
diff --git a/tests/core/conftest.py b/tests/core/conftest.py
index 1a20e2c13..375b248eb 100644
--- a/tests/core/conftest.py
+++ b/tests/core/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 161b32f01..d4dacc4f1 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock
 
diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py
index a4a901444..1b958e34d 100644
--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index a5ba16898..db78a9d55 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections import deque
diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py
index c6049b26a..20cc083ec 100644
--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest  # noqa
 
diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py
index 64b3e148e..8281298d6 100644
--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import msgspec
 
diff --git a/tests/core/utils.py b/tests/core/utils.py
index 84b0426b4..b746c1786 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections import defaultdict
diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py
index 59394b035..f2c125355 100644
--- a/tests/detokenizer/conftest.py
+++ b/tests/detokenizer/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/detokenizer/test_disable_detokenization.py b/tests/detokenizer/test_disable_detokenization.py
index 14f9babb8..ae06a985c 100644
--- a/tests/detokenizer/test_disable_detokenization.py
+++ b/tests/detokenizer/test_disable_detokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py
index e9ad8d161..bd2219772 100644
--- a/tests/detokenizer/test_stop_checker.py
+++ b/tests/detokenizer/test_stop_checker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock
 
diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py
index 4b1e4f5cf..9716f7d72 100644
--- a/tests/detokenizer/test_stop_reason.py
+++ b/tests/detokenizer/test_stop_reason.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test the different finish_reason="stop" situations during generation:
     1. One of the provided stop strings
     2. One of the provided stop tokens
diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py
index 0607dd01a..efe938a20 100644
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
index ee8f20979..95f085788 100644
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 from typing import Optional, Union
 
diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py
index 72e7ebdb7..e2de46261 100644
--- a/tests/distributed/test_ca_buffer_sharing.py
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # can only run on machines with p2p access across GPUs
 # can only run with torchrun:
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index 8f4c3537e..e2cb579e2 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test the communication operators.
 
 Run `pytest tests/distributed/test_comm_ops.py`.
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index a7ba45c9e..fae49c41d 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py
index 4b0c65d1d..b93696e4b 100644
--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from ..entrypoints.openai.test_oot_registration import (
     run_and_test_dummy_opt_api_server)
diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py
index 8de1aa20e..ec1e5a2d6 100644
--- a/tests/distributed/test_events.py
+++ b/tests/distributed/test_events.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
 import time
 
diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py
index db8281617..f641bf160 100644
--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Literal, NamedTuple, Optional
diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py
index c86d2d8a0..ef17a51ff 100644
--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Make sure ray assigns GPU workers to the correct node.
 
 Run:
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index e6410ab06..7d569fd83 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 WARNING: This test runs in both single-node (4 GPUs) and multi-node
  (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py
index 7bf93f270..69ceedd34 100644
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index 3ca6e7b33..a027a9e37 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 2c323edfa..5b32b90f3 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import multiprocessing
 import os
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 9b1bbd6e5..94ad8f4f1 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index c9eba2b43..91a594eac 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 WARNING: This test runs in both single-node (4 GPUs) and multi-node
  (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index f9eacc11d..e1357b4a3 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import multiprocessing
 import random
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
index bb38e908b..9f2c3eaec 100644
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # unit test for `examples/offline_inference/torchrun_example.py`
 import os
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 4432950f2..0287ad94e 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import socket
 
diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py
index 0f46fba3a..8b99d9d6e 100644
--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """E2E tests to verify the correctness of the encoder-decoder framework
 
 Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py
index 1a20e2c13..375b248eb 100644
--- a/tests/engine/conftest.py
+++ b/tests/engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 05d9cfc7a..ab78aa7da 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from argparse import ArgumentError, ArgumentTypeError
diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py
index 049fa2c8b..ac5a1f957 100644
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py
index 91c9ba4a7..15c7a97b5 100644
--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py
index b67dd86bf..458f4deb7 100644
--- a/tests/engine/test_multi_step_output_processor.py
+++ b/tests/engine/test_multi_step_output_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import MagicMock
diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
index 9b2f45def..b5381b61a 100644
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py
index 0cf4f69d5..fc6a78a51 100644
--- a/tests/engine/test_options.py
+++ b/tests/engine/test_options.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import nullcontext
 
 import pytest
diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py
index b29d6362f..9c62761d7 100644
--- a/tests/engine/test_short_mm_context.py
+++ b/tests/engine/test_short_mm_context.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index 3b596ea3e..a7c533ec2 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py
index 95657455b..a2d35486a 100644
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file test accuracy of the vLLM server via LMEval.
 It uses local-completions, which interacts with vLLM
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index 742a66683..97cf3b5ce 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import weakref
 
 import pytest
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 6470249dd..3a13f8c97 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py
index d10257761..f0fa54aa3 100644
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 
diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index 9a895c922..4676dc992 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 
diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py
index 099af0f36..b7d53e31f 100644
--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 
diff --git a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py
index c2b4a9358..533da9e6d 100644
--- a/tests/entrypoints/llm/test_gpu_utilization.py
+++ b/tests/entrypoints/llm/test_gpu_utilization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py
index dd5d17885..d41b0a436 100644
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import weakref
diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py
index f065f6564..61b6b4fbf 100644
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import sys
 from contextlib import nullcontext
diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py
index 665c6ea1e..1b7be15d5 100644
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index 23fd72f4e..a606eeab5 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for HF_HUB_OFFLINE mode"""
 import importlib
 import sys
diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py
index d3948e2ed..41b70f80e 100644
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file test accuracy of the vLLM server via LMEval.
 It uses local-completions, which interacts with vLLM
diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py
index 44d7ac193..437c48511 100644
--- a/tests/entrypoints/openai/correctness/test_mteb.py
+++ b/tests/entrypoints/openai/correctness/test_mteb.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
index 642c204b9..58195f98b 100644
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Evaluate Transcription API correctness by computing Word Error Rate (WER)
 on a given ASR dataset. When provided, it will also compare the WER against
diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py
index 1f7ba0da4..ab3c80905 100644
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import contextlib
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 7f959f312..d67c05ab3 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index a4ac80070..a55941976 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from http import HTTPStatus
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 2509ef0d2..dab947b21 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # imports for guided decoding tests
 import json
diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py
index 86ee17c6f..de63f4ed2 100644
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import NamedTuple
 
diff --git a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
index 9dab524ea..e9d1a8552 100644
--- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py
+++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index f18fbb0a9..daa4a78c9 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index e00f001ef..03730b672 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py
index 0419395f1..3c8ed955a 100644
--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py
index 97124c85e..6d5f92515 100644
--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/openai/test_classification.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import requests
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 8d1abe28a..504fd72aa 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 9d12f27a2..7e54143f6 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for guided decoding tests
 import json
 import shutil
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index dad76b54c..dbea2dc0b 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index b7ee3e33c..00d3ffb61 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 import io
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 81ca65b65..80640a2e1 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 
diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py
index 341defae0..08b797dc5 100644
--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
 """
diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py
index 52b4df9ce..9c2aef23e 100644
--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index cd07ca46c..bcdeaaace 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index c96151349..d4afdf775 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import suppress
 from dataclasses import dataclass, field
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index b21c0173c..2d7b84573 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import subprocess
 import sys
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 3d4f1cde2..1980daa80 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py
index a1b7a205a..f0ce50deb 100644
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from ...utils import VLLM_PATH, RemoteOpenAIServer
 
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index cae2a3b59..4ded37595 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Final
 
 import pytest
diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py
index 72ab12c56..cf16ace65 100644
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index e38491589..ff0730c77 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # imports for guided decoding tests
 import openai
diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py
index ba11cd3a2..19eba320c 100644
--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import requests
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index 647485864..099062e55 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Separate these tests out from test_completion and test_chat, because they
 # require launching a second server with a different flag. Running both servers
diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py
index 106d6b2c1..7b4966848 100644
--- a/tests/entrypoints/openai/test_root_path.py
+++ b/tests/entrypoints/openai/test_root_path.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import os
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 99639ce51..e23f41e98 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import subprocess
diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py
index b373f2912..af51a0a3e 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
 import pytest
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 5e11af8cf..94740fefc 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from contextlib import suppress
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index e8f3c2f8b..28af6489a 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from http import HTTPStatus
 from unittest.mock import MagicMock
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 0f12ac9b2..29a94c852 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py
index 3ca8a9a41..0dd6af17e 100644
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import requests
 
diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
index f1ab72230..e14315035 100644
--- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py
+++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 import json
 import tempfile
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 7d823542e..57dd25fe1 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import pytest_asyncio
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 5c48df3ce..1cb0a39df 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # imports for guided decoding tests
 import io
diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py
index 137ed9db8..b33a26af6 100644
--- a/tests/entrypoints/openai/test_truncation.py
+++ b/tests/entrypoints/openai/test_truncation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
 import openai
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index 53f057a29..990ea3579 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 1ab50b41c..4513d8b34 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 26c68e06c..fe982e286 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
index f5f327ea0..8c86b4889 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock, patch
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index 71f41ea7d..d83137472 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock, patch
 
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py
index ab8f4bd67..e1b41f45f 100644
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Union
diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py
index 0dd1fdd99..e4af60a78 100644
--- a/tests/entrypoints/test_api_server_process_manager.py
+++ b/tests/entrypoints/test_api_server_process_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import multiprocessing
 import socket
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 9f1f2321d..492946642 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from typing import Optional
diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/test_ssl_cert_refresher.py
index 23ce7a679..33ad2cfd3 100644
--- a/tests/entrypoints/test_ssl_cert_refresher.py
+++ b/tests/entrypoints/test_ssl_cert_refresher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import tempfile
 from pathlib import Path
diff --git a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
index 184bee2a7..1b95bf59f 100644
--- a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
+++ b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import SamplingParams
 from vllm.config import LoadFormat
diff --git a/tests/fastsafetensors_loader/test_weight_utils.py b/tests/fastsafetensors_loader/test_weight_utils.py
index 8772035af..78d23acfe 100644
--- a/tests/fastsafetensors_loader/test_weight_utils.py
+++ b/tests/fastsafetensors_loader/test_weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 import tempfile
diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py
index 97ceffab4..9d65159bf 100644
--- a/tests/kernels/allclose_default.py
+++ b/tests/kernels/allclose_default.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/kernels/attention/conftest.py b/tests/kernels/attention/conftest.py
index 4f04ec947..88a2fb62b 100644
--- a/tests/kernels/attention/conftest.py
+++ b/tests/kernels/attention/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index d9f956fbc..2d381a99b 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from typing import Optional
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 58da01f0e..435fe6225 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import patch
 
diff --git a/tests/kernels/attention/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py
index 82d038257..9aee818c9 100644
--- a/tests/kernels/attention/test_blocksparse_attention.py
+++ b/tests/kernels/attention/test_blocksparse_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from typing import Optional
diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py
index 2f2212dd2..e508505c2 100644
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py
index d6570e633..1e7e7e0a7 100755
--- a/tests/kernels/attention/test_cascade_flash_attn.py
+++ b/tests/kernels/attention/test_cascade_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py
index c8ee46bc6..c6ce7b0cc 100644
--- a/tests/kernels/attention/test_encoder_decoder_attn.py
+++ b/tests/kernels/attention/test_encoder_decoder_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests:
 
diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py
index 88516b75c..bd3190d09 100644
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py
index 5ad1137aa..3ad6e1d32 100644
--- a/tests/kernels/attention/test_flashinfer.py
+++ b/tests/kernels/attention/test_flashinfer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py
index 0d51a8e7f..21b08e45f 100644
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -1,5 +1,6 @@
 # Adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 import random
 
diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py
index fbad52987..de45ee1ed 100644
--- a/tests/kernels/attention/test_lightning_attn.py
+++ b/tests/kernels/attention/test_lightning_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index 7038fbea5..9d1a301eb 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pytest
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index 5a18b7916..53c37554b 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test:
 
diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py
index 8cebe32c4..5a7480a6b 100644
--- a/tests/kernels/attention/test_mla_decode_cpu.py
+++ b/tests/kernels/attention/test_mla_decode_cpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn.functional as F
diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py
index 9333777d3..b09e1bbc4 100644
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import random
diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py
index 6ffe27abf..ed58880cc 100644
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py
index fd3c9fa41..358b374ea 100644
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index be3d1879d..0cb7f5963 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py
index 79f838a95..29c5e70a8 100644
--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py
index 7a591f536..19703b8a2 100644
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py
index fa4bbe458..3eac06273 100644
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py
index c9a9679c5..40ced08b9 100644
--- a/tests/kernels/core/test_opcheck.py
+++ b/tests/kernels/core/test_opcheck.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py
index 35d62079f..e18f6230d 100644
--- a/tests/kernels/core/test_permute_cols.py
+++ b/tests/kernels/core/test_permute_cols.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py
index 8cb56314c..ab6f1ccf8 100644
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate, product
 from typing import Callable, Optional
diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py
index 8383f943b..db0fdcbf5 100644
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py
index f641ae7b6..c71215e4c 100644
--- a/tests/kernels/core/test_uva.py
+++ b/tests/kernels/core/test_uva.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py
index 93064e23d..addb8bfcd 100644
--- a/tests/kernels/mamba/test_causal_conv1d.py
+++ b/tests/kernels/mamba/test_causal_conv1d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py
index abcf3888f..f5c6a1861 100644
--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import unittest
 
diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py
index 84d4c347e..8dece26dd 100644
--- a/tests/kernels/mamba/test_mamba_ssm.py
+++ b/tests/kernels/mamba/test_mamba_ssm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py
index f5e751bea..abed1252a 100644
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 7d369edfc..b0e0feab4 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py
index 7db4fe0f4..558288ba4 100644
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
 from typing import Optional
 
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index 299279390..7238813a2 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the MOE layers.
 
 Run `pytest tests/kernels/test_moe.py`.
diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py
index 10e6ac64d..7cc83b512 100644
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the MOE permute/unpermute kernel
 
 Run `pytest tests/kernels/test_moe_permute_unpermute.py`.
diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py
index ae63b379f..be33200cc 100644
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index 8c4a2c3fa..95c10037b 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the MOE layers.
 
 Run `pytest tests/kernels/test_pplx_moe.py`.
diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py
index 922fd66db..1c51c530c 100644
--- a/tests/kernels/moe/test_rocm_aiter_topk.py
+++ b/tests/kernels/moe/test_rocm_aiter_topk.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # This is a test for the AITER ops.
 # It tests if the AITER ops are
 # 1. correctly registered as custom ops
diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
index 3b5838a99..dfd0f35c8 100644
--- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
+++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py
 import itertools
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 892309a01..0840cc7b5 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py
index 58eaeee1c..1095975ab 100644
--- a/tests/kernels/quantization/nvfp4_utils.py
+++ b/tests/kernels/quantization/nvfp4_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
 from vllm.scalar_type import scalar_types
diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py
index 896e02657..3de9cb364 100644
--- a/tests/kernels/quantization/test_allspark_gemm.py
+++ b/tests/kernels/quantization/test_allspark_gemm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py
index 7d3617281..427db3e60 100644
--- a/tests/kernels/quantization/test_aqlm.py
+++ b/tests/kernels/quantization/test_aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/kernels/quantization/test_awq.py b/tests/kernels/quantization/test_awq.py
index 248b294e5..bc0868123 100644
--- a/tests/kernels/quantization/test_awq.py
+++ b/tests/kernels/quantization/test_awq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py
index 3fc3feaf4..96797e85b 100644
--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the AWQ Triton kernel.
 
 Run `pytest tests/kernels/test_awq_triton.py`.
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index ae05d6117..8c5ee9874 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/pull/2575
 import itertools
diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py
index a4e9f83f0..fa2c9f890 100644
--- a/tests/kernels/quantization/test_block_int8.py
+++ b/tests/kernels/quantization/test_block_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py
 import itertools
diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
index d67d2dbb8..878f66647 100644
--- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for sparse cutlass kernels
 
 Run `pytest tests/kernels/test_semi_structured.py`.
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index 633addd42..51bb29df0 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for cutlass kernels
 
 Run `pytest tests/kernels/test_cutlass.py`.
diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py
index 876cf03fd..0a3edd4dd 100644
--- a/tests/kernels/quantization/test_fp8_quant.py
+++ b/tests/kernels/quantization/test_fp8_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py
index 73697a6d1..07651fef3 100644
--- a/tests/kernels/quantization/test_ggml.py
+++ b/tests/kernels/quantization/test_ggml.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import gguf
 import pytest
diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py
index ad755fe7f..436d5cb64 100644
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from pathlib import Path
 
diff --git a/tests/kernels/quantization/test_gptq.py b/tests/kernels/quantization/test_gptq.py
index fea013d9e..7fb57a157 100644
--- a/tests/kernels/quantization/test_gptq.py
+++ b/tests/kernels/quantization/test_gptq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py
index 4c7543527..dc5fecbf4 100644
--- a/tests/kernels/quantization/test_int8_kernel.py
+++ b/tests/kernels/quantization/test_int8_kernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py
 import itertools
diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py
index 25dcb587e..63ccf4a91 100644
--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py
index 5aeaaa654..998171baa 100644
--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the machete kernel.
 
 Run `pytest tests/kernels/test_machete_mm.py`.
diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py
index 52507b375..92914bd5c 100644
--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the marlin kernel.
 
 Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py
index b8aa16721..3a8f4c175 100644
--- a/tests/kernels/quantization/test_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_nvfp4_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
index 1f49900b2..0b45c2298 100644
--- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX,
diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py
index c7eee8998..533a4fe59 100644
--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index 30e6eeb8d..8a2cc3bac 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_scaled_mm kernel
 
 Run `pytest tests/kernels/test_triton_scaled_mm.py`.
diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py
index 87e4bd4b0..c56024b75 100644
--- a/tests/kernels/test_cutlass_mla_decode.py
+++ b/tests/kernels/test_cutlass_mla_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn.functional as F
diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py
index faa8d49ce..803453a20 100644
--- a/tests/kernels/test_fused_quant_activation.py
+++ b/tests/kernels/test_fused_quant_activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/test_triton_flash_attention.py b/tests/kernels/test_triton_flash_attention.py
index cf2bdc908..1c31cfb25 100644
--- a/tests/kernels/test_triton_flash_attention.py
+++ b/tests/kernels/test_triton_flash_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_flash_attention kernel
 
 Run `pytest tests/kernels/test_triton_flash_attention.py`.
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 22b3d7c2b..d1db6a8eb 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Kernel test utils"""
 
 import itertools
diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py
index dc948a48b..9f2229cc4 100644
--- a/tests/kv_transfer/test_disagg.py
+++ b/tests/kv_transfer/test_disagg.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import subprocess
diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
index c5b34660d..352ab6355 100644
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import random
diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py
index 8a6490b5c..7a0417487 100644
--- a/tests/kv_transfer/test_module.py
+++ b/tests/kv_transfer/test_module.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import subprocess
 import sys
diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 3dd923d24..32116608a 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 399311ce6..0737bb886 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import tempfile
 from collections import OrderedDict
diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index 17347300b..cc8160b28 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import time
 
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 007be7aa5..774ebb9db 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index cd9526c8b..5481b413b 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import vllm
 from vllm.lora.request import LoRARequest
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 0a8b38fa7..92db023ba 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from copy import deepcopy
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 54daea5b9..23819f03d 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 import sys
 from typing import Union
diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py
index 094541aef..01bc102bd 100644
--- a/tests/lora/test_lora_allowed_token_ids.py
+++ b/tests/lora/test_lora_allowed_token_ids.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 02f2339be..ebc0f2637 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index fd80f61a5..e9a52e1b6 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 """
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 90498c47f..b46d81f16 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 52b0834ca..8f8a27006 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 24242b8a1..99fe951bb 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 4e77c5559..0ea077933 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index 9935472ad..f16589e06 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import math
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
index 7375cabbc..a21de0705 100644
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
index add313c94..14fa79ae5 100644
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from threading import Lock
 
 import pytest
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index 43e2975cd..caa31fdb0 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 20a1ae67d..604bb307b 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 from typing import Optional
 
diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py
index 8ebc2ae98..6c93e5776 100644
--- a/tests/lora/test_resolver.py
+++ b/tests/lora/test_resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py
index 8845eb33d..6cfdaf50d 100644
--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index 63907f2c1..5065a2fb7 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py
index 0d4e0bf68..b343bef0a 100644
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import OrderedDict
 from typing import NamedTuple, Optional
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 1a5d52716..6f13e663a 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import random
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index 59a0e7420..cc1b0d819 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional, Union
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index e71c87ff3..7bb5d8980 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/tests/mistral_tool_use/conftest.py b/tests/mistral_tool_use/conftest.py
index 39ab01c9b..e89e60c5a 100644
--- a/tests/mistral_tool_use/conftest.py
+++ b/tests/mistral_tool_use/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import pytest_asyncio
diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/mistral_tool_use/test_mistral_tool_calls.py
index bbb3a0789..9bf6863f3 100644
--- a/tests/mistral_tool_use/test_mistral_tool_calls.py
+++ b/tests/mistral_tool_use/test_mistral_tool_calls.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py
index 1d809a05e..7a026cd9b 100644
--- a/tests/mistral_tool_use/utils.py
+++ b/tests/mistral_tool_use/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py
index b588a1a96..c6d89d849 100644
--- a/tests/model_executor/conftest.py
+++ b/tests/model_executor/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index e957db5b3..a94215ee3 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py
index 6cd966f84..ac31064d9 100644
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import pickle
diff --git a/tests/model_executor/test_logits_processor.py b/tests/model_executor/test_logits_processor.py
index 8301c645b..532ebba03 100644
--- a/tests/model_executor/test_logits_processor.py
+++ b/tests/model_executor/test_logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import patch
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index 7fda1f0e8..94a14bd24 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py
index bdaba22c3..df625b8d6 100644
--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import tempfile
diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py
index 8ab0167dc..7d8acab5e 100644
--- a/tests/models/language/generation/test_bart.py
+++ b/tests/models/language/generation/test_bart.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pytest
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index 05dd18fbd..ed9e54722 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from typing import Optional
 
diff --git a/tests/models/language/generation/test_granite.py b/tests/models/language/generation/test_granite.py
index f381c34f4..2a39f78a7 100644
--- a/tests/models/language/generation/test_granite.py
+++ b/tests/models/language/generation/test_granite.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from ...utils import check_logprobs_close
diff --git a/tests/models/language/generation/test_granitemoehybrid.py b/tests/models/language/generation/test_granitemoehybrid.py
index da3f5e110..952449f28 100644
--- a/tests/models/language/generation/test_granitemoehybrid.py
+++ b/tests/models/language/generation/test_granitemoehybrid.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 604cb854b..3eaadcb45 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index c1b612ae2..bdd857ff5 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 import json
 
diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py
index 603ca1cb1..6c9cc2821 100644
--- a/tests/models/language/generation/test_phimoe.py
+++ b/tests/models/language/generation/test_phimoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
index 0c8ac2ab1..07bc9f447 100644
--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 from typing import Optional
 
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py
index f45168bc0..2705be25e 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 
 import mteb
diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py
index fc0e82079..1af3c05d3 100644
--- a/tests/models/language/pooling/test_baai.py
+++ b/tests/models/language/pooling/test_baai.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from .embed_utils import EmbedModelInfo, correctness_test_embed_models
diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py
index 57b3cb58d..4a6d781ce 100644
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from transformers import AutoModelForSequenceClassification
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index 8f82c8091..9516a0142 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from vllm.config import PoolerConfig
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index f450edd82..c2f70bb64 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import importlib.util
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py
index 725e3d168..2178a815b 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
 import pytest
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 0403a20a4..2adf34b29 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from functools import partial
 
 import pytest
diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py
index 92cd7cc56..59dbd74fb 100644
--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling/test_nomic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py
index 68603e628..250b3a528 100644
--- a/tests/models/language/pooling/test_nomic_max_model_len.py
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
 import pytest
 
diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py
index 6b10aeffc..c75ff1445 100644
--- a/tests/models/language/pooling/test_scoring.py
+++ b/tests/models/language/pooling/test_scoring.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn.functional as F
diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py
index c6c2d1e7a..d6b5dbd08 100644
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py
index 1b8ac395e..33aff1c87 100644
--- a/tests/models/language/pooling/test_truncation_control.py
+++ b/tests/models/language/pooling/test_truncation_control.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index e4e48f995..a5bbcfc22 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py
index b8225f5f1..b048cec5e 100644
--- a/tests/models/multimodal/generation/test_florence2.py
+++ b/tests/models/multimodal/generation/test_florence2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index 96c444441..14552010d 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional
diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
index 972db40e8..949c0a80d 100644
--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py
index 99aa3c2d3..2bb01e494 100644
--- a/tests/models/multimodal/generation/test_mllama.py
+++ b/tests/models/multimodal/generation/test_mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, overload
 
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py
index e51dbee47..e4cd476a9 100644
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from collections.abc import Sequence
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index 506b71472..1def825ab 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 from dataclasses import asdict
 from typing import TYPE_CHECKING, Any, Optional
diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py
index 6be401b77..a2793b8c8 100644
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional, TypedDict, Union
 
diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py
index 2c8a06688..e7e7bd315 100644
--- a/tests/models/multimodal/generation/test_ultravox.py
+++ b/tests/models/multimodal/generation/test_ultravox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from typing import Any
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index d0b85842a..363d55153 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pytest
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 32117c8d8..7d20dd660 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Helpers for building inputs that can be leveraged for different test types.
 """
 from collections.abc import Iterable
diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
index a5077a090..336e2dd2b 100644
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utils for determining which subset of model tests belong to a specific
 modality, getting all combinations (similar to pytest's parametrization),
 handling multimodal placeholder substitution, and so on.
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index ccd2799ab..8c83d8f8a 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Core test implementation to be shared across modalities."""
 from typing import Any, Callable, Optional
 
diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
index cc1045561..aa5835243 100644
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom input builders for edge-cases in different models."""
 from io import BytesIO
 from typing import Callable
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index dc1ea5208..1b087191f 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Common utility functions relating to different models that are useful
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
diff --git a/tests/models/multimodal/generation/vlm_utils/runners.py b/tests/models/multimodal/generation/vlm_utils/runners.py
index 9e8a1262e..562f89df1 100644
--- a/tests/models/multimodal/generation/vlm_utils/runners.py
+++ b/tests/models/multimodal/generation/vlm_utils/runners.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Entrypoints for wrapping the core run_test implementation for specific test
 types / modalities.
 """
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index 1c2bb4d62..0ec7909e7 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Types for writing multimodal model tests."""
 from collections.abc import Iterable
 from enum import Enum
diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
index ea1caec0e..3734d87b7 100644
--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable
 
diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
index 76f9fbe02..3e2be34a5 100644
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn as nn
diff --git a/tests/models/multimodal/pooling/test_llava_next.py b/tests/models/multimodal/pooling/test_llava_next.py
index 77508738c..b6d90d2b0 100644
--- a/tests/models/multimodal/pooling/test_llava_next.py
+++ b/tests/models/multimodal/pooling/test_llava_next.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch.nn.functional as F
diff --git a/tests/models/multimodal/pooling/test_phi3v.py b/tests/models/multimodal/pooling/test_phi3v.py
index cd58a5cb4..b42ac6fb2 100644
--- a/tests/models/multimodal/pooling/test_phi3v.py
+++ b/tests/models/multimodal/pooling/test_phi3v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch.nn.functional as F
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 2377fef82..be574435e 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import partial
 from typing import Optional, Union
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index 37142b6dd..76e4acc67 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for H2OVL's multimodal preprocessing kwargs."""
 from collections.abc import Mapping
 from typing import Optional
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index c35ce2f6a..d3a55993e 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Idefics3's multimodal preprocessing kwargs."""
 import pytest
 from transformers import Idefics3Config
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index 7ec81197a..c3e2841a8 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for InternVL's multimodal preprocessing kwargs."""
 from collections.abc import Mapping
 from typing import Optional
diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py
index 614f17dbb..9ef7af556 100644
--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Llama4's multimodal preprocessing kwargs."""
 
 import pytest
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index b82bfe483..ca34d1d75 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from functools import partial
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index dcc8dc8da..e6344c4e7 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from functools import partial
diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py
index 9bd2b9887..9387212e3 100644
--- a/tests/models/multimodal/processing/test_minimax_vl_01.py
+++ b/tests/models/multimodal/processing/test_minimax_vl_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from PIL import Image
diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py
index d4794396f..a6b20a1e3 100644
--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for mllama's multimodal preprocessing and profiling."""
 import pytest
 from transformers import MllamaConfig
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index b53351544..1f3646f79 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi3v's multimodal preprocessing kwargs."""
 import pytest
 
diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py
index c6e272650..f16d261c2 100644
--- a/tests/models/multimodal/processing/test_phi4mm.py
+++ b/tests/models/multimodal/processing/test_phi4mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi4mm's multimodal preprocessing kwargs."""
 import pytest
 
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index 02abe1ca8..9d1cd1833 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py
index 224d1bced..af8f98338 100644
--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for smolvlm's multimodal preprocessing kwargs."""
 import pytest
 from transformers import SmolVLMConfig
diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py
index 1272a6297..de6851e2f 100644
--- a/tests/models/quantization/test_aqlm.py
+++ b/tests/models/quantization/test_aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py
index 597c8e48f..bd6961989 100644
--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/models/quantization/test_bitblas.py b/tests/models/quantization/test_bitblas.py
index f0781394d..754ac9a29 100644
--- a/tests/models/quantization/test_bitblas.py
+++ b/tests/models/quantization/test_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of a GPTQ model to a bitblas model.
 
 Note: GPTQ and bitblas do not have bitwise correctness.
diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py
index e01ee2026..10914abf9 100644
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # flake8: noqa
 """Tests fp8 models against ground truth generation
diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index 5f17d1228..eafdfd1b0 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests gguf models against unquantized models generations
 Note: To pass the test, quantization higher than Q4 should be used
diff --git a/tests/models/quantization/test_gptq_bitblas.py b/tests/models/quantization/test_gptq_bitblas.py
index c8e96455f..c3aed7752 100644
--- a/tests/models/quantization/test_gptq_bitblas.py
+++ b/tests/models/quantization/test_gptq_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of a GPTQ model to a bitblas model.
 
 Note: GPTQ and bitblas do not have bitwise correctness.
diff --git a/tests/models/quantization/test_gptq_marlin.py b/tests/models/quantization/test_gptq_marlin.py
index 397bdb981..db70a3bd2 100644
--- a/tests/models/quantization/test_gptq_marlin.py
+++ b/tests/models/quantization/test_gptq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compares the outputs of gptq vs gptq_marlin.
 
 Note: GPTQ and Marlin do not have bitwise correctness.
diff --git a/tests/models/quantization/test_gptq_marlin_24.py b/tests/models/quantization/test_gptq_marlin_24.py
index 6fb24b1f4..9b86ae95b 100644
--- a/tests/models/quantization/test_gptq_marlin_24.py
+++ b/tests/models/quantization/test_gptq_marlin_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of a GPTQ model to a Marlin_24 model.
 
 Note: GPTQ and Marlin_24 do not have bitwise correctness.
diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py
index 1d9aa4fa8..6ad526cc8 100644
--- a/tests/models/quantization/test_modelopt.py
+++ b/tests/models/quantization/test_modelopt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # flake8: noqa
 """Tests Model Optimizer fp8 models against ground truth generation
diff --git a/tests/models/quantization/test_mxfp4.py b/tests/models/quantization/test_mxfp4.py
index 9a0608295..7b8a334bb 100644
--- a/tests/models/quantization/test_mxfp4.py
+++ b/tests/models/quantization/test_mxfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # flake8: noqa
 """Tests Quark mxfp4 models against ground truth generation
 """
diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py
index 510858c2d..b95dad9a4 100644
--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # flake8: noqa
 """Tests Model Optimizer nvfp4 models against ground truth generation
 Note: these tests will only pass on B200
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 182a9668e..ed49676a9 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping, Set
 from dataclasses import dataclass, field
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index d403cb392..af023d903 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import patch
 
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index b62720caa..ef0ad613d 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index 3282284b6..b7527ca27 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 1a51b4aea..b7b99ce41 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test the functionality of the Transformers backend."""
 from typing import Any, Optional, Union
 
diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py
index a16384efe..b52327a18 100644
--- a/tests/models/test_utils.py
+++ b/tests/models/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index d64c0e6d4..310d3a371 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/models/utils.py b/tests/models/utils.py
index ffc904bd1..943b4f570 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from collections.abc import Sequence
diff --git a/tests/mq_llm_engine/conftest.py b/tests/mq_llm_engine/conftest.py
index 1a20e2c13..375b248eb 100644
--- a/tests/mq_llm_engine/conftest.py
+++ b/tests/mq_llm_engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py
index 808346b5e..5ff08cbb3 100644
--- a/tests/mq_llm_engine/test_abort.py
+++ b/tests/mq_llm_engine/test_abort.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that aborting is handled properly."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py
index e617bd057..49b02279d 100644
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that various errors are handled properly."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py
index 2069ff987..e9fd5b814 100644
--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that the MQLLMEngine is able to handle 10k concurrent requests."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py
index 64559609a..7976d5031 100644
--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import multiprocessing
diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index ce716e647..56e339d48 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Test the AsyncLLMEngine with multi-step-decoding
 from typing import Optional
diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py
index a823e484b..9f1b3bbe8 100644
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Test the LLMEngine with multi-step-decoding
 
diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py
index 17b36b368..b5048c8cc 100644
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
 import numpy as np
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py
index 56b5475c9..cfd44351a 100644
--- a/tests/multimodal/test_image.py
+++ b/tests/multimodal/test_image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
 import numpy as np
diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py
index f5d3e282f..ffb3a6fe8 100644
--- a/tests/multimodal/test_inputs.py
+++ b/tests/multimodal/test_inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 59f7bf8fa..8b52911c6 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import nullcontext
 from types import MethodType
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index f1e45da30..e4debb47c 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 import mimetypes
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index e67624ece..9a700808d 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import numpy as np
 import numpy.typing as npt
 import pytest
diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py
index 40fcfeeea..23346509a 100644
--- a/tests/multimodal/utils.py
+++ b/tests/multimodal/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy as np
 from PIL import Image
diff --git a/tests/neuron/1_core/test_activation.py b/tests/neuron/1_core/test_activation.py
index ec2b1238e..2d6e5f523 100644
--- a/tests/neuron/1_core/test_activation.py
+++ b/tests/neuron/1_core/test_activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py
index 033a36b41..efec56360 100644
--- a/tests/neuron/1_core/test_block_table.py
+++ b/tests/neuron/1_core/test_block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import neuronxcc.nki.language as nl
 import pytest
diff --git a/tests/neuron/1_core/test_cache.py b/tests/neuron/1_core/test_cache.py
index 3d869cd2f..670889ad6 100644
--- a/tests/neuron/1_core/test_cache.py
+++ b/tests/neuron/1_core/test_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/neuron/1_core/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py
index e96df8db6..c6fce1d1a 100644
--- a/tests/neuron/1_core/test_layernorm.py
+++ b/tests/neuron/1_core/test_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/neuron/1_core/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py
index 6d1514088..ce9eadf5a 100644
--- a/tests/neuron/1_core/test_logits_processor.py
+++ b/tests/neuron/1_core/test_logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import patch
diff --git a/tests/neuron/1_core/test_neuron_model_runner.py b/tests/neuron/1_core/test_neuron_model_runner.py
index 92417fb64..5f3268810 100644
--- a/tests/neuron/1_core/test_neuron_model_runner.py
+++ b/tests/neuron/1_core/test_neuron_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from unittest.mock import MagicMock
 
diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py
index 68f0cb805..086300269 100644
--- a/tests/neuron/1_core/test_neuron_quant.py
+++ b/tests/neuron/1_core/test_neuron_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.model_executor.layers.quantization.neuron_quant import (
     NeuronQuantConfig)
 
diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py
index 8f7e711b5..8b9a5f6e4 100644
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/neuron/1_core/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py
index da57631fc..a7ac79729 100644
--- a/tests/neuron/1_core/test_rotary_embedding.py
+++ b/tests/neuron/1_core/test_rotary_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/neuron/2_core/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py
index 3cad160b2..85a48dae5 100644
--- a/tests/neuron/2_core/test_comm_ops.py
+++ b/tests/neuron/2_core/test_comm_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from typing import Callable
 from unittest.mock import patch
diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py
index d71c88689..cac642af0 100644
--- a/tests/neuron/2_core/test_eagle.py
+++ b/tests/neuron/2_core/test_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import os
diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py
index 3e651502d..d02fff943 100644
--- a/tests/neuron/2_core/test_mistral.py
+++ b/tests/neuron/2_core/test_mistral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py
index 6fa8f9128..6b97f47d4 100644
--- a/tests/neuron/2_core/test_multi_lora.py
+++ b/tests/neuron/2_core/test_multi_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from huggingface_hub import snapshot_download
 
diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
index cb0f0c3c5..3e2c2577d 100644
--- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py
+++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import shutil
 
diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py
index e3fb6efb2..6307bb638 100644
--- a/tests/plugins/vllm_add_dummy_model/setup.py
+++ b/tests/plugins/vllm_add_dummy_model/setup.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from setuptools import setup
 
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
index 0c431cb39..b2085b01c 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import ModelRegistry
 
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index bc4a41cdf..aff349856 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional, Union
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index c23ab6430..da97cf7e2 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
index bbd11ed4a..8c34407e3 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py
index 10df0b5e0..e40f62f77 100644
--- a/tests/plugins/vllm_add_dummy_platform/setup.py
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from setuptools import setup
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
index 0d1b062ac..1b28342eb 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
index 33425bbc1..f30a36f35 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.attention.backends.flash_attn import FlashAttentionBackend
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index 5cefafc7e..67cd5ed3b 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.platforms.cuda import CudaPlatform
 
diff --git a/tests/plugins_tests/conftest.py b/tests/plugins_tests/conftest.py
index 8561f2ddf..c8c1b81ca 100644
--- a/tests/plugins_tests/conftest.py
+++ b/tests/plugins_tests/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 207de53ab..685a8cd2c 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py
index 4c95a52a9..8c2121610 100644
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py
index 4cc399175..f00a8f699 100644
--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching.
 
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 607b6c43e..a65fc934b 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching.
 
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py
index a31d8e873..2b603fe8f 100644
--- a/tests/prompt_adapter/test_bloom.py
+++ b/tests/prompt_adapter/test_bloom.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py
index e249a6e64..4f273afb4 100644
--- a/tests/prompt_adapter/test_multi_adapter_inference.py
+++ b/tests/prompt_adapter/test_multi_adapter_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py
index fb4c3e149..ba2e15b81 100644
--- a/tests/prompt_adapter/test_pa_lora.py
+++ b/tests/prompt_adapter/test_pa_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from huggingface_hub import snapshot_download
 
diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py
index 81ceecdb4..1c41d904b 100644
--- a/tests/quantization/test_auto_round.py
+++ b/tests/quantization/test_auto_round.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and inference for quantized HF models supported
  on the AutoRound.
 
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index e8ddfd7fc..325a902b3 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 '''Tests whether bitsandbytes computation is enabled correctly.
 
 Run `pytest tests/quantization/test_bitsandbytes.py`.
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index c968a68f1..807b24d4e 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and weight loading for llmcompressor-quantized models.
 
 Run `pytest tests/quantization/test_compressed_tensors.py`.
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py
index e30166842..8b0ffc0fe 100644
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether Marlin models can be loaded from the autogptq config.
 
 Run `pytest tests/quantization/test_configs.py --forked`.
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index a05eb494c..08d9573ec 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -1,4 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Expanded quantized model tests for CPU offloading
 # Base tests: tests/basic_correctness/test_cpu_offload.py
diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py
index b6db6d5f2..50179b9a9 100644
--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # flake8: noqa
 """Tests experts_int8 quantization startup and generation, 
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index e74e14a0d..e5ab7b3dd 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether FP8 computation is enabled correctly.
 
 Run `pytest tests/quantization/test_fp8.py --forked`.
diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py
index 22055c49a..23b999e7c 100644
--- a/tests/quantization/test_gptq_dynamic.py
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether gptq models with dynamic quantized can be loaded.
 
 Run `pytest tests/quantization/test_gptq_dynamic.py --forked`.
diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py
index 0e3913676..34b1b6c2e 100644
--- a/tests/quantization/test_ipex_quant.py
+++ b/tests/quantization/test_ipex_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and inference for quantized HF models supported
  on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
  
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
index 1c6bd1852..11f78a23b 100644
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether gptq models with quantized lm_head can be loaded.
 
 Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py
index 9bbb5e327..5f78bc305 100644
--- a/tests/quantization/test_ptpc_fp8.py
+++ b/tests/quantization/test_ptpc_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether PTPC w8a8 FP8 computation is enabled correctly.
 
 Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index ae09ac58e..3571f773f 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and weight loading for quark-quantized models.
 
 Run `pytest tests/quantization/test_quark.py`.
diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
index 0ea71aaf8..42081a8c6 100644
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests register custom quantization config.
 
 See https://github.com/vllm-project/vllm/issues/11926 for more details.
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index 6571fc9e4..c966dc9b8 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib.metadata
 import importlib.util
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 7a339c162..20a425b72 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.model_executor.layers.quantization import get_quantization_config
 from vllm.platforms import current_platform
diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py
index 1b669c8fd..987f3c48d 100644
--- a/tests/reasoning/test_deepseekr1_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer
diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py
index 48fb8c2f8..38cab73a4 100644
--- a/tests/reasoning/test_granite_reasoning_parser.py
+++ b/tests/reasoning/test_granite_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 from transformers import AutoTokenizer
 
diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index 95b7460d3..2d5557d5c 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index 0f894ed80..ddcf89796 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
index 8b96184f5..e27d9958f 100644
--- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
+++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import SamplingParams
 from vllm.config import LoadConfig, LoadFormat
diff --git a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py
index 06e506c35..ee448c2cc 100644
--- a/tests/runai_model_streamer_test/test_weight_utils.py
+++ b/tests/runai_model_streamer_test/test_weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 import tempfile
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 5de1137ea..bdf48c768 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of HF and vLLM when using beam search.
 
 Run `pytest tests/samplers/test_beam_search.py`.
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index 2a124aa0c..7eb9c0b5f 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Make sure ignore_eos works.
 
 Run `pytest tests/samplers/test_ignore_eos.py`.
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py
index 74f1eb4a9..901c87591 100644
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 5cc646e76..86c8a03ee 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index f9688b4b9..42b529ae1 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Make sure bad_words works.
 
 Run `pytest tests/samplers/test_no_bad_words.py`.
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
index ebe9b3021..86fc14dc8 100644
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 6ef61f2ff..3b93c6411 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for rejection sampling."""
 
 import pytest
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 7b19d5750..520b88d03 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 import random
diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py
index efa2642db..b339b4b2d 100644
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Verify that seeded random sampling is deterministic.
 
 Run `pytest tests/samplers/test_seeded_generate.py`.
diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py
index 279e5ed10..418471b8e 100644
--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for rejection sampling."""
 
 import pytest
diff --git a/tests/spec_decode/conftest.py b/tests/spec_decode/conftest.py
index 1a20e2c13..375b248eb 100644
--- a/tests/spec_decode/conftest.py
+++ b/tests/spec_decode/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 921081f3c..f3fe9db3f 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from itertools import cycle
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index 4fd52cf7e..6c453879a 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py
index eee535a14..989394614 100644
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py
index 9dfc1b2fd..760861850 100644
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests which cover integration of the speculative decoding framework with
 other features, e.g. cuda graphs.
 """
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
index b11297475..a18be80c5 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests which cover integration of the speculative decoding framework with
 tensor parallelism.
 """
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py
index a1b7c8b40..039eec8fd 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests which cover integration of the speculative decoding framework with
 tensor parallelism.
 """
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
index cb2dae541..1629c69f8 100644
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import cycle
 
diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py
index 5c60100e6..064a6e10a 100644
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
index 7bf29349d..9f778ca8d 100644
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py
index 371e6834b..d4d4d519b 100644
--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index e187b6bc1..6d385184d 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """The tests in this file verify end-to-end speculative decoding correctness.
 
 This docstring details important information on the testing methodology.
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
index eca433ffa..c10329a9b 100644
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py
index 3dc371722..4cf373809 100644
--- a/tests/spec_decode/e2e/test_seed.py
+++ b/tests/spec_decode/e2e/test_seed.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py
index 9edd8bd4c..d20c549b0 100644
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index 0bff0ea1d..407786ad3 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock, patch
 
diff --git a/tests/spec_decode/test_memory_usage.py b/tests/spec_decode/test_memory_usage.py
index 16dffe6d7..5d9dd3f72 100644
--- a/tests/spec_decode/test_memory_usage.py
+++ b/tests/spec_decode/test_memory_usage.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 This test verifies that memory usage remains constant (or never grows) when 
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 1a6693e16..e8de410f8 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from unittest.mock import MagicMock
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index ca37c9a68..f2d93203b 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import MagicMock
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index 7de54b3ed..8a7c11485 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py
index f73cf4b34..55fcf0055 100644
--- a/tests/spec_decode/test_scorer.py
+++ b/tests/spec_decode/test_scorer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index f7ef9786a..8aceaadff 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from collections import defaultdict
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py
index 24573e224..9cfc618b9 100644
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock
 
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index d303b7f12..1733f66fe 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence as GenericSequence
 from itertools import count
diff --git a/tests/standalone_tests/lazy_imports.py b/tests/standalone_tests/lazy_imports.py
index 61e3b3879..21bcb6b82 100644
--- a/tests/standalone_tests/lazy_imports.py
+++ b/tests/standalone_tests/lazy_imports.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Description: Test the lazy import module
 # The utility function cannot be placed in `vllm.utils`
diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py
index ce8689f5b..cd59d579e 100644
--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/tensorizer_loader/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from vllm.distributed import cleanup_dist_env_and_memory
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 747ec56ad..c97f5968d 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import gc
 import os
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index 05d2c624d..edc0849df 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test hashing of cache blocks.
 
 Run `pytest tests/test_cache_block_hashing.py`.
diff --git a/tests/test_config.py b/tests/test_config.py
index 7db95e3f6..dffea9138 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import MISSING, Field, asdict, dataclass, field
 from typing import Literal, Union
diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py
index a9b4f5cbf..b9593e2a3 100644
--- a/tests/test_embedded_commit.py
+++ b/tests/test_embedded_commit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import vllm
 
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index d361808ed..e549834fa 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 046f70504..8f235f147 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 import json
 import logging
diff --git a/tests/test_outputs.py b/tests/test_outputs.py
index c41bd6723..4bb1c20f7 100644
--- a/tests/test_outputs.py
+++ b/tests/test_outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.outputs import RequestOutput
 
diff --git a/tests/test_regression.py b/tests/test_regression.py
index e09294542..f5f1ed8e8 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Containing tests that check for regressions in vLLM's behavior.
 
 It should include tests that are reported by users and making sure they
diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py
index 9af810c4c..39e3808d8 100644
--- a/tests/test_sampling_params.py
+++ b/tests/test_sampling_params.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the SamplingParams class.
 """
 
diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py
index eecfa1db3..ef4aef3af 100644
--- a/tests/test_scalartype.py
+++ b/tests/test_scalartype.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py
index c45ed6926..e9138b9e8 100644
--- a/tests/test_seed_behavior.py
+++ b/tests/test_seed_behavior.py
@@ -1,4 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 
 import numpy as np
diff --git a/tests/test_sequence.py b/tests/test_sequence.py
index 902de1099..a782a3bf7 100644
--- a/tests/test_sequence.py
+++ b/tests/test_sequence.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index 77fec0968..64706defb 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import multiprocessing as mp
 import os
diff --git a/tests/test_triton_utils.py b/tests/test_triton_utils.py
index eb8ad48fd..64f72668f 100644
--- a/tests/test_triton_utils.py
+++ b/tests/test_triton_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import sys
 import types
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 42e0df1ff..a2fd845ea 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 
 import asyncio
diff --git a/tests/test_version.py b/tests/test_version.py
index 56842b6d4..fd07abb59 100644
--- a/tests/test_version.py
+++ b/tests/test_version.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import patch
 
diff --git a/tests/test_vllm_port.py b/tests/test_vllm_port.py
index ccbb36bf4..88e1efd8f 100644
--- a/tests/test_vllm_port.py
+++ b/tests/test_vllm_port.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from unittest.mock import patch
diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py
index c740fde42..e218678c4 100644
--- a/tests/tokenization/test_cached_tokenizer.py
+++ b/tests/tokenization/test_cached_tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pickle
 from copy import deepcopy
 
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 079100e78..b289dc972 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Generator
 from typing import Any, Optional
diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py
index 8942f8891..d82884293 100644
--- a/tests/tokenization/test_get_eos.py
+++ b/tests/tokenization/test_get_eos.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This test file includes some cases where it is inappropriate to
 only get the `eos_token_id` from the tokenizer as defined by
diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py
index b16d9af35..69b3c6294 100644
--- a/tests/tokenization/test_mistral_tokenizer.py
+++ b/tests/tokenization/test_mistral_tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from mistral_common.protocol.instruct.messages import (AssistantMessage,
diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py
index eddc63098..09a3638fd 100644
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import PreTrainedTokenizerBase
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
index bcfa78ed4..0570c1525 100644
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py
index 772eeb345..5abb10164 100644
--- a/tests/tokenization/test_tokenizer_registry.py
+++ b/tests/tokenization/test_tokenizer_registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Any, Optional, Union
 
diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py
index 4bf9b45fe..510b54790 100644
--- a/tests/tool_use/conftest.py
+++ b/tests/tool_use/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import pytest_asyncio
diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py
index ba0ad78f6..a30c58b09 100644
--- a/tests/tool_use/test_chat_completion_request_validations.py
+++ b/tests/tool_use/test_chat_completion_request_validations.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py
index 448347be6..8c01c86e2 100644
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index a40675744..351531393 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Generator
diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py
index 910e0b2d5..fff20c68d 100644
--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from typing import Optional
diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py
index b320b335e..53ba03a0a 100644
--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from typing import Optional
diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py
index 291769848..3b43b723d 100644
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 from copy import deepcopy
 from unittest.mock import MagicMock
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index efa6455c4..a17fab9ae 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
 from typing import Any, Optional
diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py
index 21d7fce69..b26bdd34d 100644
--- a/tests/tpu/lora/test_lora.py
+++ b/tests/tpu/lora/test_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 import vllm
diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py
index 06e00187c..3a180c679 100644
--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 import os
diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py
index acb6b90f5..9c90df1b7 100644
--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py
index 19df22f78..ab6cd3069 100644
--- a/tests/tpu/test_moe_pallas.py
+++ b/tests/tpu/test_moe_pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the Pallas MOE implementation.
 
 Run `pytest tests/kernels/moe/test_moe_pallas.py`.
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
index 20f9dd77d..a13cf7064 100644
--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index caa233ec3..4dbae7c15 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 # type: ignore
 from __future__ import annotations
diff --git a/tests/utils.py b/tests/utils.py
index d21b18470..ade28a481 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import copy
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 61aee8752..ad34becb1 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
 
 import pytest
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 1a7a31d98..897d181ec 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching."""
 
 from typing import Optional
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index f38454b1b..aa074f1bb 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 from unittest.mock import Mock
 
diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py
index 511d57d40..85415f6ad 100644
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py
index 4217dc37e..c6f7481dd 100644
--- a/tests/v1/core/test_specialized_manager.py
+++ b/tests/v1/core/test_specialized_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py
index 48c265560..161bcd4d3 100644
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py
index a125d3fb7..3eedc535d 100644
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import pytest
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 2fad37d68..93e7c12f3 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import random
diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py
index d04679c12..d7722142b 100644
--- a/tests/v1/engine/conftest.py
+++ b/tests/v1/engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 5d52ad5f5..957d50d0d 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from contextlib import ExitStack
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index 9b2f1a919..f70a3ce14 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import ArgumentError
 
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index e78c7480a..3d7632a60 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import time
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 8bea032f6..a01b205df 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py
index e77916f95..6284dcfb9 100644
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from typing import Optional
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index fac701c4c..a83454ee6 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import time
diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py
index 4a23e0c1b..b58bc75fc 100644
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from dataclasses import dataclass
diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py
index 8c03f0433..ffe061212 100644
--- a/tests/v1/entrypoints/conftest.py
+++ b/tests/v1/entrypoints/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index 5f1fff200..a39ab47b8 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -1,5 +1,6 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py
index c650ccd0c..dffb32846 100644
--- a/tests/v1/entrypoints/openai/test_chat_completion.py
+++ b/tests/v1/entrypoints/openai/test_chat_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 333ad2379..a7c31c064 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
index 7b4583bc3..ed4ecbe84 100644
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import os
 
diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
index be2d84f3b..2b2b147ce 100644
--- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import lm_eval
diff --git a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
index 5363fbde0..95465a25f 100644
--- a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
+++ b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import openai
diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
index 13071f581..3d720fe0c 100644
--- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
+++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import itertools
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index a21d92c52..ddf2836d0 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import filecmp
 import shutil
 import tempfile
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 9b2a720c1..9b257143d 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
     NixlConnectorMetadata)
diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
index dc963251c..52dc21a2c 100644
--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
index 86eacb693..2312e2135 100644
--- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 3c3190b32..e190e9561 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py
index 02475f7c1..ea54038a2 100644
--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import ray
 
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 3800cb392..612eca116 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Generator
diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py
index f62770060..085b2ee09 100644
--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import lm_eval
 
diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py
index cbdb0b910..f35c3e194 100644
--- a/tests/v1/sample/test_rejection_sampler.py
+++ b/tests/v1/sample/test_rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import pytest
diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py
index 24b759bc1..a2beb5ad7 100644
--- a/tests/v1/sample/test_sampler.py
+++ b/tests/v1/sample/test_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py
index 0512a1e02..ac0f3eb58 100644
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py
index 220f05c7f..63fdeb5a6 100644
--- a/tests/v1/sample/test_topk_topp_sampler.py
+++ b/tests/v1/sample/test_topk_topp_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs
diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py
index 932b652ae..8c111f846 100644
--- a/tests/v1/sample/utils.py
+++ b/tests/v1/sample/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from enum import Enum
 from typing import Optional
diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py
index ed368fe82..682d84dc2 100644
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that we handle a startup Error and shutdown."""
 
 import pytest
diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py
index 9fedbe4f9..523b7ee23 100644
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that we handle an Error in model forward and shutdown."""
 
 import asyncio
diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py
index 0fe48da47..a077d48fe 100644
--- a/tests/v1/shutdown/test_processor_error.py
+++ b/tests/v1/shutdown/test_processor_error.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test error handling in Processor. Should not impact other reqs."""
 
 import asyncio
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py
index 1bba19102..88fc5297a 100644
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that we handle a startup Error and shutdown."""
 
 import pytest
diff --git a/tests/v1/shutdown/utils.py b/tests/v1/shutdown/utils.py
index 8f7c0380d..124254a41 100644
--- a/tests/v1/shutdown/utils.py
+++ b/tests/v1/shutdown/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Shutdown test utils"""
 
 SHUTDOWN_TEST_TIMEOUT_SEC = 120
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index b49ac45f3..eff8eff43 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest import mock
 
diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index f577fb4ab..9070d2b10 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test whether spec decoding handles the max model length properly."""
 
 import pytest
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index 50548219f..ffea86d0d 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy as np
 
diff --git a/tests/v1/structured_output/test_utils.py b/tests/v1/structured_output/test_utils.py
index ffc0bceee..4e7c4b33e 100644
--- a/tests/v1/structured_output/test_utils.py
+++ b/tests/v1/structured_output/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py
index 366fa3b25..53242180b 100644
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/test_metrics_reader.py
index 68539c80b..c05de5e4c 100644
--- a/tests/v1/test_metrics_reader.py
+++ b/tests/v1/test_metrics_reader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import prometheus_client
 import pytest
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 1b77417a1..e5eadfd4e 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index ee490071f..0ab4e0bf5 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import UserDict
 from dataclasses import dataclass
 from typing import Optional
diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py
index b68f08385..a3df882a9 100644
--- a/tests/v1/test_utils.py
+++ b/tests/v1/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py
index 1c0210b6a..7117a66c2 100644
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A basic correctness check for TPUs
 
 Run `pytest tests/v1/tpu/test_basic.py`.
diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py
index 01664598c..55fee4ee1 100644
--- a/tests/v1/tpu/test_mha_attn.py
+++ b/tests/v1/tpu/test_mha_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test:
 
diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py
index 8c87fc836..a61773a4f 100644
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py
index 8faa5270b..3a9d80847 100644
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from unittest.mock import ANY, patch
 
 import torch
diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py
index 811833f73..f4a2d5ac8 100644
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A basic performance regression test for TPUs
 
 Run `pytest tests/v1/tpu/test_perf.py`.
diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py
index 2bbeb3dda..198bb1e16 100644
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 
 import pytest
diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py
index ff9217f8f..ca5c067b3 100644
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 
 import pytest
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 348f12887..230c97e78 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import unittest.mock as mock
 
 import pytest
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 27741bd15..e932e4b32 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 from typing import Optional
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 6ba6d1f6f..ceb9d4df2 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py
index c03943149..83be8bdce 100644
--- a/tests/vllm_test_utils/setup.py
+++ b/tests/vllm_test_utils/setup.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from setuptools import setup
 
diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py
index 1d1219fbe..2818428de 100644
--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 vllm_utils is a package for vLLM testing utilities.
 It does not import any vLLM modules.
diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py
index 3b25980cb..49fd083ef 100644
--- a/tests/vllm_test_utils/vllm_test_utils/blame.py
+++ b/tests/vllm_test_utils/vllm_test_utils/blame.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import dataclasses
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
index 27077f13d..9454221b2 100644
--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import dataclasses
diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py
index 9f99b3725..3aabae099 100644
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py
index 372d71a78..3f202d4db 100644
--- a/tests/worker/conftest.py
+++ b/tests/worker/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py
index 3e237aacc..35ac90b38 100644
--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 
diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
index a41fc5217..a5e61128d 100644
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index ae4b53652..0be25aa2f 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py
index 22466105b..d8767f700 100644
--- a/tests/worker/test_profile.py
+++ b/tests/worker/test_profile.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 3ab807099..6d9f404ac 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py
index 709befc53..92914186b 100644
--- a/tools/check_spdx_header.py
+++ b/tools/check_spdx_header.py
@@ -1,8 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import sys
 
-SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0"
+SPDX_HEADER = (
+    "# SPDX-License-Identifier: Apache-2.0\n"
+    "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
 SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
 
 
diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py
index 18c9726a1..77b2dfc39 100644
--- a/tools/check_triton_import.py
+++ b/tools/check_triton_import.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 import sys
 
diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py
index 6c201dd25..63ceee582 100644
--- a/tools/enforce_regex_import.py
+++ b/tools/enforce_regex_import.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import subprocess
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 9601b578e..209c3a576 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index 8ec3dfc97..038d3c44f 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index 011af2522..7368ae953 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
diff --git a/use_existing_torch.py b/use_existing_torch.py
index 7d352c6ca..a9f79e169 100644
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 52022fb8f..6232b657e 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 # The version.py should be independent library, and we always import the
 # version library first.  Such assumption is critical for some customization.
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 3c8e6b95c..008a7aa94 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import importlib
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index a9a624b85..ae63e0603 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py
index 9cc2b181f..9753a0880 100644
--- a/vllm/adapter_commons/layers.py
+++ b/vllm/adapter_commons/layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py
index a84fbea2e..7b685880a 100644
--- a/vllm/adapter_commons/models.py
+++ b/vllm/adapter_commons/models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Optional, TypeVar
diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py
index 2b604b91b..8135b54ba 100644
--- a/vllm/adapter_commons/request.py
+++ b/vllm/adapter_commons/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 
diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py
index 46e9629e1..a1a56b6bb 100644
--- a/vllm/adapter_commons/utils.py
+++ b/vllm/adapter_commons/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py
index 3c1d26404..07e85d138 100644
--- a/vllm/adapter_commons/worker_manager.py
+++ b/vllm/adapter_commons/worker_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Any, Optional
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index a21eb7f59..1c1623084 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from pathlib import Path
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
index 03f3b9dab..31cde431b 100644
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import lru_cache
 from pathlib import Path
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index d8cca9b74..c977242a3 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Literal
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index bf06746a9..01834aeeb 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from functools import lru_cache
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py
index 85c5715fa..344040586 100644
--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata,
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index f3d6ffaeb..deb3951d6 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index ea4f84072..a2fd557f8 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple, Type
diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py
index 4567893a9..39e667bca 100644
--- a/vllm/attention/backends/cpu_mla.py
+++ b/vllm/attention/backends/cpu_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py
index eceab1f1a..3548df88d 100644
--- a/vllm/attention/backends/dual_chunk_flash_attn.py
+++ b/vllm/attention/backends/dual_chunk_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with Dual chunk flash attention and sparse attention.
 """
 import math
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 7f8f720ee..26be2c04f 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashAttention."""
 from collections import defaultdict
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 37b20d073..7ae7ea37f 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import os
diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py
index 0e62748dd..9a6b8a40e 100644
--- a/vllm/attention/backends/flashmla.py
+++ b/vllm/attention/backends/flashmla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index d701c59a2..5128e4975 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index f322c7b3d..30441b3ad 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ Attention layer with torch scaled_dot_product_attention
     and PagedAttention."""
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index 1007140ef..50842abd3 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 # MLA Common Components
 
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 19642a939..a6823ac05 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index f1def25c8..820ddcab7 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import defaultdict
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py
index c974f2a15..855036071 100644
--- a/vllm/attention/backends/rocm_aiter_mla.py
+++ b/vllm/attention/backends/rocm_aiter_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 7134472da..755e0da06 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer ROCm GPUs."""
 import itertools
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c1bd638f2..760634004 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ Attention layer with torch scaled_dot_product_attention
     and PagedAttention."""
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
index 6945c2c6e..d9fff8fac 100644
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Dict, List, Optional, Type
 
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index a281c9771..e3f02a193 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention backend utils"""
 from collections import defaultdict
 from contextlib import contextmanager
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index a9d4a70b5..8355e0397 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with xFormers and PagedAttention."""
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 9e4fbe0b4..6c5b05a5c 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer."""
 from typing import Any, Dict, List, Optional
 
diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
index bc87ce33a..05fa9d11f 100644
--- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
+++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index 6ab69ea5b..c6f6cc297 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 
diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py
index e64fc1139..445720c70 100644
--- a/vllm/attention/ops/blocksparse_attention/utils.py
+++ b/vllm/attention/ops/blocksparse_attention/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Helper functions for 3D sparse pattern
 # These function are not optimized and very inefficient.
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py
index 6ca2a6414..4f839348e 100644
--- a/vllm/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/attention/ops/chunked_prefill_paged_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Authors:
 #  - Burkhard Ringlein <ngl@zurich.ibm.com>
diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py
index 18b69a6b3..b85f27ac4 100644
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py
 from typing import Optional, Tuple
 
diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py
index a97c36338..412dd20ec 100644
--- a/vllm/attention/ops/hpu_paged_attn.py
+++ b/vllm/attention/ops/hpu_paged_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py
index 1702203b1..b7e4ba4d7 100644
--- a/vllm/attention/ops/ipex_attn.py
+++ b/vllm/attention/ops/ipex_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py
index f9fcfe6a6..5cb1a4739 100644
--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/attention/ops/merge_attn_states.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
index 8c9145bb9..e28ff7e8b 100644
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import neuronxcc.nki.isa as nisa
 import neuronxcc.nki.language as nl
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index 827c3041a..c6d1501e2 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 729b61b02..13bef9672 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # The kernels in this file are adapted from LightLLM's context_attention_fwd:
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py
index 421891ab6..cce6b4639 100644
--- a/vllm/attention/ops/rocm_aiter_mla.py
+++ b/vllm/attention/ops/rocm_aiter_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py
index 0f3cf1842..ad97152e2 100644
--- a/vllm/attention/ops/rocm_aiter_paged_attn.py
+++ b/vllm/attention/ops/rocm_aiter_paged_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import aiter as rocm_aiter
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py
index fb983907e..c27b377ae 100644
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 62cfb813d..a26e713b1 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Fused Attention
 ===============
diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py
index 30e61b6d8..56d78ed5e 100644
--- a/vllm/attention/ops/triton_merge_attn_states.py
+++ b/vllm/attention/ops/triton_merge_attn_states.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 87cf333f7..92c09e6dd 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Authors:
 #  - Burkhard Ringlein <ngl@zurich.ibm.com>
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index ebbdea27f..cb577fa67 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from contextlib import contextmanager
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py
index ca88549f3..69cde06fd 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 from vllm import envs
diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index ddacc6695..f3bc42183 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional, Union
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 21fe3eb62..0ef3e0254 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample
diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/endpoint_request_func.py
index a28630d50..aba60edc5 100644
--- a/vllm/benchmarks/endpoint_request_func.py
+++ b/vllm/benchmarks/endpoint_request_func.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """The request function for API endpoints."""
 
 import io
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index dc1c42879..5c6124db8 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""
 
 import argparse
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 858a0c6a0..019ebcf8d 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput.
 
 On the server side, run one of the following commands
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 3ea6c194b..be9ea39f0 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""
 import argparse
 import dataclasses
diff --git a/vllm/benchmarks/utils.py b/vllm/benchmarks/utils.py
index 45a0ddbd5..f0bb99326 100644
--- a/vllm/benchmarks/utils.py
+++ b/vllm/benchmarks/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/vllm/collect_env.py b/vllm/collect_env.py
index 86eb465b8..64172a9bf 100644
--- a/vllm/collect_env.py
+++ b/vllm/collect_env.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
@@ -6,7 +7,6 @@
 import datetime
 import locale
 import os
-import re
 import subprocess
 import sys
 # Unlike the rest of the PyTorch this file must be python2 compliant.
@@ -14,6 +14,8 @@ import sys
 # Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
 from collections import namedtuple
 
+import regex as re
+
 from vllm.envs import environment_variables
 
 try:
@@ -815,4 +817,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py
index dc3e1482e..ce4e50a2b 100644
--- a/vllm/compilation/activation_quant_fusion.py
+++ b/vllm/compilation/activation_quant_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index c4bfffe92..5af3b7efe 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ast
 import dataclasses
diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py
index 84d1e1f77..4d7aeeb4d 100644
--- a/vllm/compilation/base_piecewise_backend.py
+++ b/vllm/compilation/base_piecewise_backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Protocol
 
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index f651ee691..f754fc238 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 9293610cc..36c810ec2 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import copy
 import hashlib
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index 2200671b8..c584c103f 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import dataclasses
diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
index 0ad480e28..8bf957368 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from contextlib import ExitStack
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index f02994c55..05e4ca9f0 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 from typing import Callable, Optional, TypeVar, Union, overload
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index 70f3b8b6d..286221d32 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import operator
 from collections.abc import Iterable
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index 618b2fe94..7e2c5b4fe 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, NamedTuple, Optional
 
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
index b9eeb0c8d..9ef388932 100644
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import operator
 from collections.abc import Iterable
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index a9359fe1e..810d0801e 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
 import inspect
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index 786c7c1e1..1e059b59f 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py
index cef19f925..6d1893777 100644
--- a/vllm/compilation/multi_output_match.py
+++ b/vllm/compilation/multi_output_match.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import abc
 import operator
diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py
index 13e4cd73f..46f70dcdc 100644
--- a/vllm/compilation/noop_elimination.py
+++ b/vllm/compilation/noop_elimination.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Union
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 07ebd3e1b..621c89a14 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from torch import fx as fx
 
diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py
index 17dded87f..d41093903 100644
--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py
index 4b881d0b6..cd3970657 100644
--- a/vllm/compilation/torch25_custom_graph_pass.py
+++ b/vllm/compilation/torch25_custom_graph_pass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Any, Optional
 
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index 0fe73b72b..3ccbf52d9 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 1a8211f0a..8c8d0b5cb 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
diff --git a/vllm/config.py b/vllm/config.py
index 8aa1b5610..d99e501ca 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ast
 import copy
diff --git a/vllm/connections.py b/vllm/connections.py
index 84e32a4d5..103505eb3 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping, MutableMapping
 from pathlib import Path
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index d4d31c58d..444bb25f2 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from typing import List, Optional
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index 1966eac1c..a337007a9 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import deque
 from dataclasses import dataclass
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index d64142e77..ea490c327 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Dict, FrozenSet, List, Optional, Tuple
 
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 301656996..1a05881f7 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index c388366b8..dae6ead04 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import deque
 from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 1ca9e49da..2913a01bf 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Token blocks."""
 import sys
 from bisect import bisect_left
diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py
index 910afdd9f..e933c6ee7 100644
--- a/vllm/core/block/utils.py
+++ b/vllm/core/block/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Block manager utils."""
 from vllm.sequence import SequenceGroup
 from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index c6bf6d163..a33399204 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A block manager that manages token blocks."""
 from typing import Dict, List, Optional
 from typing import Sequence as GenericSequence
diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py
index 0e363eddc..7ec4768e9 100644
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import heapq
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index 4c1182deb..ba290eeda 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from abc import ABC, abstractmethod
diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py
index 0f5d8ca6d..71b22942a 100644
--- a/vllm/core/placeholder_block_space_manager.py
+++ b/vllm/core/placeholder_block_space_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Tuple
 
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 06d4ed470..44be855b1 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import os
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 6fcbca628..942e866ed 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # cumem-based pytorch pluggable allocator to implement sleep mode.
 # other approaches tried but failed:
diff --git a/vllm/distributed/__init__.py b/vllm/distributed/__init__.py
index 39955ddac..e911b2a1a 100644
--- a/vllm/distributed/__init__.py
+++ b/vllm/distributed/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .communication_op import *
 from .parallel_state import *
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index d85a41dda..0a5a95176 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional, Union
 
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 7177754a3..ae7590299 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib.util
 from typing import TYPE_CHECKING
 
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 52b970949..38370d4dc 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
 from typing import Optional
 from weakref import WeakValueDictionary
diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py
index c04218cb9..94effa0b2 100644
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index a05a13f51..0eebdf873 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index 6c15ef644..2c38e8ed2 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This file is a pure Python wrapper for the cudart library.
 It avoids the need to compile a separate shared library, and is
 convenient for use when we just need to call a few functions.
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 5c2dbcc27..7dd104a4f 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from typing import Optional, Union
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 11b8b57fe..7c6001e87 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ctypes
 import json
diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py
index 9536a7f88..f00f6b62b 100644
--- a/vllm/distributed/device_communicators/hpu_communicator.py
+++ b/vllm/distributed/device_communicators/hpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.distributed as dist
diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py
index dfa4b5194..5b61a1687 100644
--- a/vllm/distributed/device_communicators/neuron_communicator.py
+++ b/vllm/distributed/device_communicators/neuron_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
 from vllm.distributed.device_communicators.base_device_communicator import (
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 0ccd42312..294862929 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 6f69089b6..04a4d0147 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # This file is a pure Python wrapper for the NCCL library.
 # The main purpose is to use NCCL combined with CUDA graph.
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 40e57e662..0f66f0aeb 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle
 import time
diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py
index a17752796..c60a7a7eb 100644
--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py
index 256e7965e..216ff85c8 100644
--- a/vllm/distributed/device_communicators/xpu_communicator.py
+++ b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index 29c6a70c4..9bf1c058a 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import queue
 import threading
diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py
index 8b6abf5a8..fa9b7e4f1 100644
--- a/vllm/distributed/kv_transfer/__init__.py
+++ b/vllm/distributed/kv_transfer/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.distributed.kv_transfer.kv_transfer_state import (
     KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group,
diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py
index e9b70610e..181c33925 100644
--- a/vllm/distributed/kv_transfer/kv_connector/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 KVConnectorBase Class for Distributed KV Cache & Hidden State communication
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index dce0b545c..58dfa251c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
 from typing import TYPE_CHECKING, Callable
diff --git a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
index d121cb701..78bf30956 100644
--- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LMCache KV Cache Connector for Distributed Machine Learning Inference
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
index 58eabd0a3..94a7ce91a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 MooncakeStore Connector for Distributed Machine Learning Inference
 The MooncakeStoreConnector transfers KV caches between prefill vLLM workers
diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
index ed8fe3816..e7c079e1f 100644
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Simple KV Cache Connector for Distributed Machine Learning Inference
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index b1c9c9af6..c62444e75 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 KV cache helper for store.
 """
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
index e66aaa7f8..f00f31dde 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorRole)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index bc9258e9d..8f9d70eec 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 KVConnectorBase_V1 Class for Distributed KV Cache & Hidden State
 communication in vLLM v1
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index 2cb68dc1f..cc1f4ba35 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import TYPE_CHECKING
 
 import torch
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 0aabb260f..5aab10b2b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 4d228dbc9..3f0b0e295 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import math
 import threading
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
index 0421a65a2..f86b92692 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
 import os
 from dataclasses import dataclass
diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py
index 819c06805..8633fdaf5 100644
--- a/vllm/distributed/kv_transfer/kv_connector_agent.py
+++ b/vllm/distributed/kv_transfer/kv_connector_agent.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A centralized entrypoint to perform distributed KV cache transfer.
 
 This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
index d1ffb8092..eef14269f 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains a new class `KVLookupBufferBase` that allows developers to
 think of KV cache operations as inserting new KV cache entries (`insert`)
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
index 5bb711021..4381aad1e 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains a new class `MooncakeStore` that allows developers to
 think of KV cache transfer operations as putting new KV cache entries
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
index e3b2274bd..a0ff7c320 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
     Implements a distributed key-value (KV) cache transfer mechanism.
 
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
index 40589fb3e..1423fd032 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file defines an interface `KVPipeBase`
 that provides an abstraction for sending and receiving tensors, or None, via
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index aa4b1ba71..9f3494b81 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import os
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index 761c56f7e..09de0b682 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
     This module implements a PyNccl pipe for sending and receiving
     Optional[torch.Tensor] between distributed ranks with advanced
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index 25d2f2cf5..60f1d5d8b 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import TYPE_CHECKING, Optional
 
 from vllm import envs
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 32c9301bf..10f87c49b 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Adapted from
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 96d08dc1a..67f71643d 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Adapted from
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e3b8a18cc..587a23134 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 import argparse
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 19b219b67..6d8d97cf5 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import copy
diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py
index 94674262b..28a023a71 100644
--- a/vllm/engine/async_timeout.py
+++ b/vllm/engine/async_timeout.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Workaround for https://github.com/python/cpython/issues/86296
 #
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a9600a2c8..dbcf78f02 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import time
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 916afe0c8..8d51f0472 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from typing import TYPE_CHECKING
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index acc83011d..9375dc4c4 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 These types are defined in this file to avoid importing vllm.engine.metrics
 and therefore importing prometheus_client.
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index af72c8e6b..bf9f66903 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import uuid
 from dataclasses import dataclass, field
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 18b7c187b..f2f442485 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import copy
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 434cb4985..ef088bd39 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle
 import signal
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
index 4c8e295c1..19c5963d3 100644
--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Callable, List
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 110f84a65..e0fa6a00e 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
 from typing import Callable, List, cast
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index e88f119c8..dbf6a371d 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List
 
diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py
index 6cad9ec8f..7925d91f6 100644
--- a/vllm/engine/output_processor/stop_checker.py
+++ b/vllm/engine/output_processor/stop_checker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, List, Optional, Tuple
 
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
index 0d2b58c10..1e127eb98 100644
--- a/vllm/engine/output_processor/util.py
+++ b/vllm/engine/output_processor/util.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List
 from typing import Sequence as GenericSequence
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 28341c2c6..727d59283 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from abc import ABC, abstractmethod
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 1c0271811..56f8754c2 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 NOTE: This API server is used only for demonstrating usage of AsyncEngine
 and simple performance benchmarks. It is not intended for production use.
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index b051cd333..95c806c22 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
diff --git a/vllm/entrypoints/cli/benchmark/base.py b/vllm/entrypoints/cli/benchmark/base.py
index 94fb415f5..30a884410 100644
--- a/vllm/entrypoints/cli/benchmark/base.py
+++ b/vllm/entrypoints/cli/benchmark/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.entrypoints.cli.types import CLISubcommand
diff --git a/vllm/entrypoints/cli/benchmark/latency.py b/vllm/entrypoints/cli/benchmark/latency.py
index 5aca16e0b..e0358a262 100644
--- a/vllm/entrypoints/cli/benchmark/latency.py
+++ b/vllm/entrypoints/cli/benchmark/latency.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.benchmarks.latency import add_cli_args, main
diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py
index 9e857af7d..717da630a 100644
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 import vllm.entrypoints.cli.benchmark.latency
diff --git a/vllm/entrypoints/cli/benchmark/serve.py b/vllm/entrypoints/cli/benchmark/serve.py
index d5a858920..304370157 100644
--- a/vllm/entrypoints/cli/benchmark/serve.py
+++ b/vllm/entrypoints/cli/benchmark/serve.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.benchmarks.serve import add_cli_args, main
diff --git a/vllm/entrypoints/cli/benchmark/throughput.py b/vllm/entrypoints/cli/benchmark/throughput.py
index 88ee6aa03..20431cd3d 100644
--- a/vllm/entrypoints/cli/benchmark/throughput.py
+++ b/vllm/entrypoints/cli/benchmark/throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.benchmarks.throughput import add_cli_args, main
diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py
index 810ecfdf7..141aafdb1 100644
--- a/vllm/entrypoints/cli/collect_env.py
+++ b/vllm/entrypoints/cli/collect_env.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index 5eba72fec..3e834b3b2 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # The CLI entrypoint to vLLM.
 import signal
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 215fcf3c3..58dcdfe21 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Commands that act as an interactive OpenAI API client
 
 import argparse
diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py
index f74c8da9b..353034f88 100644
--- a/vllm/entrypoints/cli/run_batch.py
+++ b/vllm/entrypoints/cli/run_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import asyncio
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 040ae166a..f9c56e655 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os
diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py
index f739a68c5..0a7244312 100644
--- a/vllm/entrypoints/cli/types.py
+++ b/vllm/entrypoints/cli/types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index a4f70a51e..9f4dc19fb 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import signal
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e05189ef4..fd28bf39e 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 import warnings
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index d4655dd5e..f3aee188d 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 5a4295ff7..2f8819bca 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import atexit
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index f196ff6ed..ca70e78df 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains the command line arguments for the vLLM's
 OpenAI-compatible server. It is kept in a separate file for documentation
diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py
index 04d5091a9..29d72256c 100644
--- a/vllm/entrypoints/openai/logits_processors.py
+++ b/vllm/entrypoints/openai/logits_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from functools import lru_cache, partial
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index e72c23993..ecfcc0068 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index ac250b3cb..9994b3cae 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import tempfile
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index ea8e187dc..7e514d660 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py
index 90cdd389d..3ac4f01ea 100644
--- a/vllm/entrypoints/openai/serving_classification.py
+++ b/vllm/entrypoints/openai/serving_classification.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from http import HTTPStatus
 from typing import Optional, Union, cast
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 1c06070cb..ce5eca855 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import time
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 3785d2642..e87decfe6 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 from typing import Final, Literal, Optional, Union, cast
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index f96a4ac8b..ac3883bde 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
 import io
 import json
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 74433a1a3..764b0e736 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import pathlib
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 7c401d4f5..b896cc46b 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import base64
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 9bdacb551..f58611c49 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import time
 from collections.abc import AsyncGenerator, Mapping
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 0d739bbf9..3db0a71fa 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Final, Optional, Union
 
diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py
index 9fc5b562e..f667c7e9b 100644
--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import io
 import time
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 054c0b006..3e4f4e149 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .abstract_tool_parser import ToolParser, ToolParserManager
 from .deepseekv3_tool_parser import DeepSeekV3ToolParser
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 931d5aab9..02aeab613 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
index 14e743e13..60025af2a 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Union
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index 383e0d44d..5508ba6a3 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index b8bf14253..fcc5b7edd 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index 2b9f9852b..c7030d34d 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index 3f2799f80..e5dcdf9a0 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index 2714a545f..66b483d8b 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index 323fb1441..6bf44a434 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import ast
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 4eda7044c..5698bc70a 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index fecad7e65..ef5b14f3c 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
index 00690ad79..5501028cf 100644
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index bc5d15dcb..73329cdf7 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ast
 import json
diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py
index acbff3258..aa41cd6dc 100644
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from json import JSONDecodeError, JSONDecoder
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 80b6c07c6..c4e044f3a 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Union
 
 from torch.nn import CosineSimilarity
diff --git a/vllm/entrypoints/ssl.py b/vllm/entrypoints/ssl.py
index dba916b8b..e3646a60a 100644
--- a/vllm/entrypoints/ssl.py
+++ b/vllm/entrypoints/ssl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from ssl import SSLContext
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 1b0ea6909..6fb32ff18 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import functools
diff --git a/vllm/env_override.py b/vllm/env_override.py
index 71f031d1e..b0a061d2c 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import torch
diff --git a/vllm/envs.py b/vllm/envs.py
index 3dd0d9045..2e3d6eeb5 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
 import os
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 40ca1d299..99e12201c 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import time
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index d1f8c36fb..4e8c6d790 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py
index e680d53cb..852c8f5cf 100644
--- a/vllm/executor/msgspec_utils.py
+++ b/vllm/executor/msgspec_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from typing import Any, Type
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 380b672c3..a6c172bef 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 8e67c7a41..bdc2b1f4c 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 7bc98a16f..c222f1609 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index 1d3a6e443..7ebeb4a22 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index f192be1c4..f3b0518a4 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections import defaultdict
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index df4f844cd..37bf2b7a4 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .data import (DecoderOnlyInputs, EmbedsInputs, EncoderDecoderInputs,
                    ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 843c45bd6..23cb5e502 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast
 
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index 4c64a41ac..8c3700799 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 from typing import Literal, Optional, TypedDict, Union, cast, overload
 
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index b9acabeab..a13e563f3 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from collections.abc import Mapping
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index f424a8f61..73d19aecd 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
diff --git a/vllm/jsontree.py b/vllm/jsontree.py
index 91cd7cb21..4cbe0f76e 100644
--- a/vllm/jsontree.py
+++ b/vllm/jsontree.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Helper functions to work with nested JSON structures."""
 from collections.abc import Iterable
 from functools import reduce
diff --git a/vllm/logger.py b/vllm/logger.py
index fd16dd95b..0ddb83cb8 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Logging configuration for vLLM."""
 import datetime
 import json
diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py
index 7ab463258..cf690a89a 100644
--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.logging_utils.formatter import NewLineFormatter
 
diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py
index 47ce0ab18..d14515f56 100644
--- a/vllm/logging_utils/dump_input.py
+++ b/vllm/logging_utils/dump_input.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import enum
diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py
index 010b0a124..0affef100 100644
--- a/vllm/logging_utils/formatter.py
+++ b/vllm/logging_utils/formatter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
index 29a73656b..5967d0836 100644
--- a/vllm/logits_process.py
+++ b/vllm/logits_process.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Union
 
diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
index b6b138a44..7fc4cfe02 100644
--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # pylint: disable=unused-argument
 from typing import TYPE_CHECKING, Optional, Union, cast
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 023c8e9c9..66e037a97 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # pylint: disable=unused-argument
 import math
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index 294b49e0a..958364fca 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence as GenericSequence
 from typing import Optional
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index dfdc908d7..262e67995 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import os
diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py
index 85601d58c..22aa3c63d 100644
--- a/vllm/lora/ops/torch_ops/__init__.py
+++ b/vllm/lora/ops/torch_ops/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand  # noqa: F401
 from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py
index ab65faceb..cba5baad8 100644
--- a/vllm/lora/ops/torch_ops/lora_ops.py
+++ b/vllm/lora/ops/torch_ops/lora_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py
index 5a39705e8..805de4b6f 100644
--- a/vllm/lora/ops/triton_ops/__init__.py
+++ b/vllm/lora/ops/triton_ops/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
 from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
diff --git a/vllm/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py
index 0f971c035..e93064d0c 100644
--- a/vllm/lora/ops/triton_ops/kernel_utils.py
+++ b/vllm/lora/ops/triton_ops/kernel_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Utilities for Punica kernel construction.
 """
diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py
index 9feb9e462..9e1f90e75 100644
--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
index ac459a832..39e647b9b 100644
--- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
+++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LoRA kernels metadata preparation utilities.
 """
diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py
index c3871bd58..3f9edfc6d 100644
--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 6225635c2..5857f7fec 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py
index 94062b05d..7e7c3c892 100644
--- a/vllm/lora/ops/xla_ops/__init__.py
+++ b/vllm/lora/ops/xla_ops/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
                                             bgmv_shrink)
diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py
index dff4d5181..9118f3351 100644
--- a/vllm/lora/ops/xla_ops/lora_ops.py
+++ b/vllm/lora/ops/xla_ops/lora_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import jax
 import jax.numpy as jnp
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 7d335e5f7..a20d73f0f 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py
 
diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py
index 915fc6623..e664ffa1d 100644
--- a/vllm/lora/punica_wrapper/__init__.py
+++ b/vllm/lora/punica_wrapper/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
 from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index e03f73290..5b4902dcb 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py
index 8118a72d6..59049cccc 100644
--- a/vllm/lora/punica_wrapper/punica_cpu.py
+++ b/vllm/lora/punica_wrapper/punica_cpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional, Union
 
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 224640ec7..6b038309d 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py
index 416c23e73..b20c9785a 100644
--- a/vllm/lora/punica_wrapper/punica_hpu.py
+++ b/vllm/lora/punica_wrapper/punica_hpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional, Union, final
 
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index 922d6c060..c684ac77c 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py
index 0556e583f..6b48268c5 100644
--- a/vllm/lora/punica_wrapper/punica_tpu.py
+++ b/vllm/lora/punica_wrapper/punica_tpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from typing import TYPE_CHECKING, Optional, Union
diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py
index 1adb40b4c..0b0a7989f 100644
--- a/vllm/lora/punica_wrapper/utils.py
+++ b/vllm/lora/punica_wrapper/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional, Union
 
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index 616e94f8d..5bbba7830 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from typing import Optional
diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py
index 33f35322f..5808ae105 100644
--- a/vllm/lora/resolver.py
+++ b/vllm/lora/resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections.abc import Set
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 619dd3bdc..ee196e3f6 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional, Union
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index f1ae03097..7da44569f 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from typing import Any, Literal, Optional, Union
diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
index 763615217..55dfe8088 100644
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            PackedvLLMParameter)
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index acf722467..7e6cdd987 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch.nn as nn
 
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index a2b61a1b1..3c2998bec 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py
index 58adcc3ca..05b6a1c32 100644
--- a/vllm/model_executor/guided_decoding/guidance_decoding.py
+++ b/vllm/model_executor/guided_decoding/guidance_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 
 import llguidance
diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py
index e17df68b4..379b5eaa3 100644
--- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/guidance_logits_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 import os
 from typing import Any
diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py
index 316860718..fa97b6dbf 100644
--- a/vllm/model_executor/guided_decoding/guided_fields.py
+++ b/vllm/model_executor/guided_decoding/guided_fields.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional, TypedDict, Union
diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
index 7eaf9e38e..f9b51f4c1 100644
--- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
+++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import lru_cache
 from json import loads as json_loads
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index e41af4b36..26c2d958e 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import concurrent.futures
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 6986b6554..4ef4db7c4 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024- the Outlines developers
 # This file is adapted from
diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py
index 3f77cf394..8fdfa983e 100644
--- a/vllm/model_executor/guided_decoding/utils.py
+++ b/vllm/model_executor/guided_decoding/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import regex as re
 
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index d2e568609..bdd3a1a9c 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # noqa: UP007
 from __future__ import annotations
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index a32c26317..cc9c8d445 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom activation functions."""
 import math
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 5c262287f..2bdc96e29 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from typing import Any, Optional
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index 26a433da2..d827869d0 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ CUTLASS based Fused MoE kernels."""
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 46a814e6e..331544d64 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 import importlib.util
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index c2db79365..205a95e7f 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused batched MoE kernel."""
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 4c84dd538..40b76994f 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused MoE utilities for GPTQ."""
 import functools
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 78f8eb926..883a48c98 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused MoE kernel."""
 import functools
 import json
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 1e193c909..3ce4cbc28 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
 from abc import abstractmethod
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 7d3ddf8f1..5e321c9b4 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
index d025f1257..98e175b12 100644
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index 9d8bd62c6..d35bd0098 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.nn.functional as F
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index cb396f26c..da7871434 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
index da27633f2..6160da732 100644
--- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
+++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.nn.functional as F
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 783ebebbf..8405603cf 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pplx_kernels as pplx
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index 98f98b3bd..77a9686c9 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 824062491..d44989cce 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import IntEnum
 from functools import cache
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index 2cfe37314..373e8ab39 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index d9d2520e1..c3a584782 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from math import prod
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index e8abd32ff..b3c65e341 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom normalization layers."""
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py
index 96659af40..978086d19 100644
--- a/vllm/model_executor/layers/lightning_attn.py
+++ b/vllm/model_executor/layers/lightning_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 from einops import rearrange
 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 269ac043d..588aa8deb 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from abc import abstractmethod
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 6b69a2608..3d0125344 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that compute logits from hidden_stats."""
 import inspect
 from concurrent.futures import ThreadPoolExecutor
diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py
index 019f634a9..88053faf9 100644
--- a/vllm/model_executor/layers/mamba/mamba2_metadata.py
+++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from dataclasses import dataclass
 
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 156e8752e..118bd8d55 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 from torch import nn
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index f94ab75f9..6d9ea5387 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index 21e27160f..a10c5ab69 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao.
 # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index 689c940d1..ccfb278cd 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/selective_state_update.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
index 0fdb055aa..11ca1255e 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_bmm.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
index 1652c5181..365e1c54b 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_scan.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
index ee6335690..58bfb661d 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_state.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
index 79a1663b8..b121275e9 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_combined.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
index 6f69ca743..a28fc9ffa 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_state_passing.py
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index d2c42191b..258038bed 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from enum import IntEnum
 from typing import Optional, Union
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 407b9c72f..1cb23e7a1 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Literal, get_args
 
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py
index 8bf0ca5c0..2ea8c5dc5 100644
--- a/vllm/model_executor/layers/quantization/aqlm.py
+++ b/vllm/model_executor/layers/quantization/aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Supports AQLM compression, see https://github.com/Vahe1994/AQLM
 # and https://arxiv.org/pdf/2401.06118.pdf
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index eb8ffa378..ea17cd56c 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fractions import Fraction
 from typing import Any, Optional, Union
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 87afdb623..f8bc3ab5e 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 0c8d082bb..56d803c6b 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py
index 5e5491578..ebc526d6d 100644
--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index c9533da9d..78c5c75c0 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 from abc import ABC, abstractmethod
diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py
index 1cd12bb76..9e5ce39ec 100644
--- a/vllm/model_executor/layers/quantization/bitblas.py
+++ b/vllm/model_executor/layers/quantization/bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 049ce7a71..38935bc96 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 27547f315..dff62af86 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import suppress
 from typing import Any, Literal, Optional, cast
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 9241ceeb4..ebb029572 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from enum import Enum
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 79bf5c108..25924c733 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .compressed_tensors_scheme import CompressedTensorsScheme
 from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index f010bc034..30ed55aee 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index daa25d23a..a5d48f235 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 6ea31e50c..3f3e7668f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
index cf60b34ba..8202ce951 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Callable, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 61e4918ca..01a87a088 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 99bb73b71..1e61e058c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 7792ce865..6189f0609 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index a33c58acb..74787603e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index 2380d3570..9bcf1aa2b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 75e81c4dd..402646498 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py
index 0c1eaff93..8030be525 100644
--- a/vllm/model_executor/layers/quantization/deepspeedfp.py
+++ b/vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 3601d219d..01b0064f0 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index 223682ee9..3e465ee2c 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index ac9b74945..cea4d26a4 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
 import importlib.util
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 1fcb6d7af..2171f729a 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index 436f1e3cc..d3ab1be3b 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from enum import Enum
diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py
index be9510abd..78e0f59fa 100644
--- a/vllm/model_executor/layers/quantization/gptq_bitblas.py
+++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index cf012e145..f92ebdea9 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional, Union
 
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
index e90416f37..eba917d85 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py
index a8faf9772..ee8a0e34b 100644
--- a/vllm/model_executor/layers/quantization/hqq_marlin.py
+++ b/vllm/model_executor/layers/quantization/hqq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 8108c7976..31ad96ecc 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
index 55ad00b1c..07ecc0962 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index bb1dc40ad..0bf0d530d 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
index e07177dd6..785e559df 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
index 29e206991..649d07b4d 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
index 50d293cf4..fef333e86 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
index 855867fa4..c7c458618 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import partial
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
index 899011f00..1597492a5 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
index 2d92af74b..9ebf5f303 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index 5d58c0489..18f5ce04f 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
index 6c2c464e6..165548a06 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 98a0b30be..6ddd4a9ec 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
index c09ca83d0..817565cf2 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
index a97b53b9d..3de28af40 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index 67723c7c9..e5604670f 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
index 2437030c8..62667db26 100644
--- a/vllm/model_executor/layers/quantization/marlin.py
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 2abe16a08..3f79b203a 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional, Union
 
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 74bd6dc13..3aa23f068 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py
index b2d6bf5db..804023666 100644
--- a/vllm/model_executor/layers/quantization/neuron_quant.py
+++ b/vllm/model_executor/layers/quantization/neuron_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from importlib.util import find_spec
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index 9e4fb3363..32ba1055f 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py
index 6028b8a2a..25978cb13 100644
--- a/vllm/model_executor/layers/quantization/qqq.py
+++ b/vllm/model_executor/layers/quantization/qqq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index df4bfbbbc..6ae5f5c9a 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
 from typing import Any, Optional, cast
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index aa7d72543..4c2da4c8b 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
index d7dac1757..ec09d9b2a 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .quark_scheme import QuarkScheme
 from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
index 40c8ea86d..c167e949a 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
index 34c077b29..3c56251b7 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
index 149c90937..47e0a492b 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
index 94f9fcd56..ae68d5bbc 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
index 5e56bcb75..99f5ec159 100644
--- a/vllm/model_executor/layers/quantization/quark/utils.py
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py
index c0be40c16..a10815292 100644
--- a/vllm/model_executor/layers/quantization/schema.py
+++ b/vllm/model_executor/layers/quantization/schema.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains the Pydantic schemas for various quantization-related
 parameters. When a relevant quantization technique is specified, these
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index 7f9f3e643..af362f7a7 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py
index 7941ec973..83c8a98ea 100644
--- a/vllm/model_executor/layers/quantization/tpu_int8.py
+++ b/vllm/model_executor/layers/quantization/tpu_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py
index f7ee47288..6ad56bae3 100644
--- a/vllm/model_executor/layers/quantization/utils/__init__.py
+++ b/vllm/model_executor/layers/quantization/utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .layer_utils import replace_parameter, update_tensor_inplace
 
diff --git a/vllm/model_executor/layers/quantization/utils/allspark_utils.py b/vllm/model_executor/layers/quantization/utils/allspark_utils.py
index 97860765a..1992b4d20 100644
--- a/vllm/model_executor/layers/quantization/utils/allspark_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/allspark_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
index 70d24cc89..82ee3edfd 100644
--- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 4c213f2c8..1ebd2a898 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/pull/2575
 import functools
diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
index 36161d13b..db82b0def 100644
--- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from copy import deepcopy
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 72fff3fa1..a694a1917 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/4cb53ecd0cffceb6dee5c011a58f65997a86f151/python/sglang/srt/layers/quantization/int8_kernel.py
 import functools
diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py
index 5acae7ca3..fbc0f23ac 100644
--- a/vllm/model_executor/layers/quantization/utils/layer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Union
 
diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py
index 6d840b568..580c36a0e 100644
--- a/vllm/model_executor/layers/quantization/utils/machete_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index e059a7ac3..7540a1516 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index 13dcdc00a..ca10db69d 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 1f6e74244..5372c49d9 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index 81112b27f..b2c228c24 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility functions used for tests and benchmarks"""
 
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
index 73feb4264..1c93c3646 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility functions used for tests and benchmarks"""
 
 import random
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
index 0123540fc..8a64bebae 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
index e7c95e38e..9d4a188f5 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
index f29220831..6e8e98d54 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
 __all__ = [
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 6ba327f3d..d6b96774b 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This file is used for /tests and /benchmarks"""
 from collections.abc import Mapping
 from types import MappingProxyType
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index eed8998fe..adc67aa64 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional, Union
 
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 3db734958..a6e58a77d 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import cached_property
 from importlib.util import find_spec
diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index 839688e31..3f2d57177 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index afc059719..9de233896 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index 32375db0c..08840fc40 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
 from collections.abc import Iterator
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py
index 969cd59b5..0a36fe9be 100644
--- a/vllm/model_executor/layers/spec_decode_base_sampler.py
+++ b/vllm/model_executor/layers/spec_decode_base_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from typing import Optional, Union
diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py
index a14c86148..5dabaa537 100644
--- a/vllm/model_executor/layers/typical_acceptance_sampler.py
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.jit
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 001e6aaf0..d97d84238 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility methods for model layers."""
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 46d2075af..0f636d83a 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from dataclasses import dataclass
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index a443a652d..f36437103 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index d619d9f25..5018c7d9a 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 
 import torch
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 3df835a93..ebbb021ca 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
 import fnmatch
 import glob
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 6946627a5..4624ff01d 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
 import glob
 import os
diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py
index 64fa2be76..f4a7da574 100644
--- a/vllm/model_executor/model_loader/dummy_loader.py
+++ b/vllm/model_executor/model_loader/dummy_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch.nn as nn
 
 from vllm.config import LoadConfig, ModelConfig
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index 1eac50422..203c80760 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from collections.abc import Generator
 
diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py
index e65d16cae..fad97aba8 100644
--- a/vllm/model_executor/model_loader/neuron.py
+++ b/vllm/model_executor/model_loader/neuron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for selecting and loading Neuron models in transformers-neuronx
 framework."""
 import ast
diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
index 72ad4da29..f450961c6 100644
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for selecting and loading Neuron models in
 neuronx-distributed-inference framework."""
 # Disabling yapf because yapf and isort have conflicts for the below imports
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index a39e26c6d..83e0f386c 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
 import glob
 import os
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index b5a5031bb..2fd9cfba3 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import collections
 import glob
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 90c0bdf08..24d1e1365 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import contextlib
diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py
index 1923e040a..b9982f312 100644
--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
 import copy
 from collections.abc import Generator
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 9c8d647a2..e6eaade09 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for selecting and loading models."""
 import contextlib
 import inspect
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 7a9a68be8..857f4bca6 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for downloading and initializing model weights."""
 import fnmatch
 import glob
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 3580c4fa5..27c169d2d 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
                          SupportsPP, SupportsV0Only, has_inner_state,
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 6ab03c40a..1651e3e42 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any, Optional, TypeVar
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index 2e2a18abd..b13d863eb 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # A modified implementation of the AIMv2 Transformer
 # inserted here also the image tokenizer used by Ovis2
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 94a432856..4693c9487 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Snowflake Arctic model."""
 from collections.abc import Iterable
 from typing import Optional, Union
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index f74e13888..bb4177dfc 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Optional, TypedDict, Union
 
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 08d49d71e..22efb707a 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -1,4 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0 Adapted from
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project Adapted from
 # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Literal, Optional, TypedDict, Union, cast
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index bcff6eb3f..0de5de5e8 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index d6a705fb1..29e0e2a2e 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Bamba model."""
 # Added by the IBM Team, 2024
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 92bbe1bb6..a0ec12674 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Derived from BART implementation posted on HuggingFace; license below:
 #
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 0b1d0f103..389393987 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 8a387d71f..0f22393c7 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from copy import deepcopy
 from typing import Optional
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index acbc5d04d..2b457fd8a 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Minimal implementation of BlipVisionModel intended to be only used 
 within a vision language model."""
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index db0dd2051..279541bed 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Literal, Optional, TypedDict, Union
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 10424e218..6e4a399f3 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index a4528ca26..aea44261d 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 4e95afe1a..129f0942f 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 """Inference-only ChatGLM model compatible with THUDM weights."""
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 9fd528fd7..dcab00822 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Minimal implementation of CLIPVisionModel intended to be only used
 within a vision language model."""
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 546b5f932..ee67cc640 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py
index f1cc7e0f9..f03c58a12 100644
--- a/vllm/model_executor/models/constant_size_cache.py
+++ b/vllm/model_executor/models/constant_size_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Any
 
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index f21887f71..7a4dd6944 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional, Union
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index 88d1ca9f7..2f0202f1e 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 03ef7bed0..6e6e74b0d 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from typing import Optional
 
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index b78c193c1..0f996d04e 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 5c8793f59..765718e57 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index fb1675d29..221932145 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 838560692..aaf105ec2 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py
index 00dbbebb1..d78ee100b 100644
--- a/vllm/model_executor/models/fairseq2_llama.py
+++ b/vllm/model_executor/models/fairseq2_llama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved.
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 376793594..62a93dabd 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 1c0e3911f..28f257eab 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only FalconH1 model."""
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index f8acc5670..47760aabb 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections import OrderedDict
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index fbad7f56d..cb141dbc5 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 0f6d94e75..99ed51f8e 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Copyright (c) Google Inc.
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index b46716213..ce405041b 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 3a88adcce..e19e0026b 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright 2025 The vLLM team.
 # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 182cc86d3..23e251707 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Any, Literal, Optional, TypedDict
diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py
index 6269ebcee..defa77b84 100644
--- a/vllm/model_executor/models/glm.py
+++ b/vllm/model_executor/models/glm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only HF format GLM-4 model compatible with THUDM weights."""
 from vllm.config import VllmConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index f351ce5a0..5e2908a82 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2025 The Zhipu AI team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 4e1371671..034c7654f 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/THUDM/CogAgent
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index c2c310fca..fd3decbae 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index c4ae4fc3c..661a67bdc 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 69fdd90cf..bd162a5e5 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 401fa9f5c..d418d8bb8 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 3524d036d..bd4d5d0b6 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index fd8fb48c5..831164ba8 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index f342dfff8..5a70f3a61 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 443b102c9..f434b7a74 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only GraniteMoeHybrid model."""
 # Added by the IBM Team, 2025
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py
index 817e6091d..bb160dbce 100644
--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only GraniteMoeShared model.
 
 The architecture is the same as granitemoe but with the addition of shared
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 6a444e8d1..4273afbf4 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from typing import Optional
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index bc9e9a3c0..2d930527b 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from
 # https://github.com/ROCm/vllm/blob/cea7419f151cc50293a05b7fac8547f8f887c9f6/vllm/model_executor/models/grok1.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 904f5330c..8f7f359b7 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
 # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index b8bdc7aa3..9e27200fb 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
 # Copyright 2024 The vLLM team.
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index fdb128ef5..4bc5e2a0c 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 8be8841c1..cb2a4062b 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
                     Union, overload, runtime_checkable)
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index d325a6b67..4a1ea74a2 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload,
                     runtime_checkable)
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 538e9de4f..58e8163e0 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
 # --------------------------------------------------------
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 3f3e3966e..e8549b4e0 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from functools import partial
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index 6893d0239..4bbb49da0 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index c37d3afb4..0c61369c5 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
 # --------------------------------------------------------
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index d6a1e0bb4..bed4a5dff 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 6f9fa60c9..8294f846b 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Jamba model."""
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index b575f4476..351d1fbdc 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
 # Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index d36b6466c..5d5080479 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 40fdd84d8..a852be66b 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team.
 # All rights reserved.
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index 172dc8b5e..f73b863fe 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 1e40017fc..d31a321b8 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index ced71b6dc..725e1b2c1 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 2fb79f57a..6f5f23187 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 9303ea121..a3406d090 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 7ea759fd5..d90d3d4a0 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index ce76a76b6..8162ac3f7 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch MAMBA model."""
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index 65c6467bc..cf9e1bd03 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch MAMBA2 model."""
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py
index 47d0ef9cc..49ba974c6 100644
--- a/vllm/model_executor/models/mamba_cache.py
+++ b/vllm/model_executor/models/mamba_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
index 95ef1134b..709a5a993 100644
--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py
index 49ea64e02..9b83f848e 100644
--- a/vllm/model_executor/models/mimo.py
+++ b/vllm/model_executor/models/mimo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py
index cbca6a4c8..6066ec76c 100644
--- a/vllm/model_executor/models/mimo_mtp.py
+++ b/vllm/model_executor/models/mimo_mtp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index f471a86ff..d398a5d12 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index 2a6867d12..92c13e81b 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index 039c3d22d..06c2eb4e8 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index ae5df0f92..ff5959ed1 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 04cc7e35e..4100fee0e 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minimax_cache.py b/vllm/model_executor/models/minimax_cache.py
index c95cbb419..9164ac06a 100644
--- a/vllm/model_executor/models/minimax_cache.py
+++ b/vllm/model_executor/models/minimax_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import torch
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index ac0fe7b10..02800449b 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only MiniMaxText01 model."""
 import copy
 import math
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index 14c1250ca..b2ededcaf 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
 from typing import Literal, Optional, TypedDict, Union, cast
 
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 051a73120..9147240b2 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 9bc7a1615..dec365119 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 8220200d2..3183c762d 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 713c9e8d2..e9f91feb3 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 58549b10e..54fae279d 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team.
 # All rights reserved.
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index a7d7aa7d4..c6a97388d 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 18eab6051..35f416a6e 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from typing import Optional
 
diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py
index 25e6f5940..11a2a384c 100644
--- a/vllm/model_executor/models/module_mapping.py
+++ b/vllm/model_executor/models/module_mapping.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 #  https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 640a2049a..1fa76b9ac 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index 9f11d4a42..d0fdab13e 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
 # This file is meant to be used in kimi_vl.py only
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 6c396d778..0878ada34 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
 import math
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index d0999e30e..eabf47b1a 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 9808fe055..a766ed947 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 172434e66..2f7f8e437 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
 # --------------------------------------------------------
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index fcb7c619a..1dc4df85c 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 33adacdae..499e6d30e 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index af2894555..ebfdb690f 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 8376d6241..9eaac1e28 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index da2a194e6..d121188ba 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index 232a63c50..5c11d54c6 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/ovis/modeling_ovis.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 427005e9b..a0e291257 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Literal, Optional, TypedDict, Union
 
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index d46b95fea..f8db99eb9 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 330ad5c59..21d517b3a 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
index 8f84e0726..f4e870c53 100644
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from llama.py
 """Inference-only Phi3 model code inherit from Llama.py"""
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index d00d7d886..533655fd5 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index b757e661d..376c53d2c 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 418ff900f..924e64368 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Any, Literal, Optional, TypedDict, Union
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
index 98cef7506..ae7a8a732 100644
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 # Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com)
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py
index f468fdbd5..c4890d842 100644
--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 # Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index d9917c26d..dddd19c74 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 9f28d4cef..705586b6a 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 55a65f807..670576c68 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only PLaMo2 model."""
 import math
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py
index 40ac5e30a..4fdcae5de 100644
--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2025 The vLLM team.
 # Copyright 2025 IBM.
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 2fda87a4f..e804f03e0 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index a664864ff..23f65b99c 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index d89b822dd..7172394e4 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index f62c7e1d2..7770ec711 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 3182a7532..6951630c6 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 143b9f98b..a2c65f4b5 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 81dc38988..76d7ecdd1 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 5c30e36c7..a4f8a361e 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index dbe2be8a7..393ce41a9 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 8a4c2850d..823197fc9 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index f5d242fdf..e828ce9c9 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-VL/blob/main/modeling_qwen.py
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index fcef457a7..57d1b7c53 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Whenever you add an architecture to this page, please also update
 `tests/models/registry.py` with example HuggingFace models for it.
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 76008b729..8fa8b8979 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 4803da295..3630f59f5 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Implementation of SiglipVisionModel intended to be only used
 within a vision language model."""
 
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index eefadda91..08c47faca 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
 # --------------------------------------------------------
diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py
index 31dec5502..0f22ba5b4 100644
--- a/vllm/model_executor/models/smolvlm.py
+++ b/vllm/model_executor/models/smolvlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index fcd17cc1c..8dd52f1d2 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 86ce813dd..d6ec743ce 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team.
 # All rights reserved.
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index f4ba5a803..9d9a2bff0 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index 7d713d23c..f0b31b133 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py
index e05f23f99..3666f7011 100644
--- a/vllm/model_executor/models/teleflm.py
+++ b/vllm/model_executor/models/teleflm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index b87a2ebf2..2f78d9d4c 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 #
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index c1a4dc1b3..43836f295 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 3d821d3dc..aa88f4210 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Iterable, Mapping
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 901d83ec5..ac6a659bb 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Final, Generic, Optional, Protocol, TypeVar, Union
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index c6e303d60..3ee5f7dba 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index 48e254bdd..a4f97c774 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch Zamba2 model implementation for vLLM.
 
 This module implements the Zamba2 architecture from 
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 34a0b527b..750ee7850 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fractions import Fraction
 from typing import Callable, Optional, Union
diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py
index 4c5db7396..4dd443bc2 100644
--- a/vllm/model_executor/pooling_metadata.py
+++ b/vllm/model_executor/pooling_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 6b83a59b5..56f0f0984 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from dataclasses import dataclass
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 27cea6521..cbaa34bfc 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utils for model executor."""
 import copy
 from typing import Any, Optional
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 815e34d5a..2ef9f1ccc 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from .base import MultiModalPlaceholderMap
 from .hasher import MultiModalHashDict, MultiModalHasher
 from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 1fd2ab7f8..fbb29276f 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
 from io import BytesIO
 from pathlib import Path
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 184c801e6..7188ed14c 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index b4cd6a908..b79883597 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle
 from collections.abc import Iterable, Mapping
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index a63ec0bd8..e673632d4 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 from io import BytesIO
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 600a34d39..35d2a6e8c 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 63af84274..cae62b223 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections import UserDict
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index aa7914e40..5cfca57bf 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import sys
 from abc import ABC, abstractmethod
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 53f5b243d..1faecb7bd 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index b9f5cee92..27aaa661c 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 1d838f66f..2b34cdf40 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import groupby
 from pathlib import Path
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 261d56aba..bedb9536e 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 from abc import abstractmethod
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 3960388bf..891305eb7 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections.abc import MutableSequence
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 00d00d05f..13453d2c4 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import traceback
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index eaffaac78..2739f5c8c 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 9f833cbb5..e2d9424de 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Code inside this file can safely assume cuda platform, e.g. importing
 pynvml. However, it should not initialize cuda context.
 """
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index a8dd7df9f..3cf289501 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import TYPE_CHECKING, Optional
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index c7a627262..1ec9c78a3 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 import os
 import platform
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 56f204e71..04e918d7a 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 import os
 from functools import lru_cache
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index ef1c632a5..a929366db 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from datetime import timedelta
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 0173b1569..07e52017f 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional, Union, cast
 
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index b2a6ad5d7..73f6f3d41 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional
 
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 4cd3552f8..2cb177b9b 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import os
diff --git a/vllm/plugins/lora_resolvers/filesystem_resolver.py b/vllm/plugins/lora_resolvers/filesystem_resolver.py
index 219231f77..b999d07a6 100644
--- a/vllm/plugins/lora_resolvers/filesystem_resolver.py
+++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
 from typing import Optional
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 9a3b254f9..322f9ed3e 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Any, Optional
 
diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py
index 6934d328a..2f9ebe531 100644
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 from collections import defaultdict
diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py
index b26fd4dd8..9f0f56a15 100644
--- a/vllm/profiler/utils.py
+++ b/vllm/profiler/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Callable, Union
diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py
index c2f9f1691..b5b925d04 100644
--- a/vllm/prompt_adapter/layers.py
+++ b/vllm/prompt_adapter/layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional
diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py
index 795591606..864b50c86 100644
--- a/vllm/prompt_adapter/models.py
+++ b/vllm/prompt_adapter/models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import math
diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py
index dfb8e61d7..3ce50d0a2 100644
--- a/vllm/prompt_adapter/request.py
+++ b/vllm/prompt_adapter/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import msgspec
 
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index dd179ab93..ddd007868 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
 
diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py
index 28dcc1687..56265de80 100644
--- a/vllm/prompt_adapter/worker_manager.py
+++ b/vllm/prompt_adapter/worker_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 from typing import Any, Optional, Set, Type
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 65606ce55..e8cd56551 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 9dd5191da..e827d381c 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index 1c283c092..1a5ca46a6 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional, Union
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 07a63e294..5820001b9 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional, Union
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 7095034b1..61bafc724 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional, Union
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 4294465f6..7abdcecca 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampling parameters for text generation."""
 import copy
 from dataclasses import dataclass
diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py
index fc1761c84..9060b55c7 100644
--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
 import struct
diff --git a/vllm/scripts.py b/vllm/scripts.py
index 7e569d2d2..7a7fdccf0 100644
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.entrypoints.cli.main import main as vllm_main
 from vllm.logger import init_logger
diff --git a/vllm/sequence.py b/vllm/sequence.py
index d359f897d..ffe890eb2 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sequence and its related classes."""
 import copy
 import enum
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index e08ed742a..f9b882469 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from itertools import chain, count
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 991d2040a..8ccfefea1 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional
 
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index dd085ad77..70ec1590e 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
index 0b62a988e..82b5a79fa 100644
--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 from typing import List, Optional, Set, Tuple
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 4430da26c..a4784cad9 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from typing import Callable, Optional, Union
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py
index bdaf31895..8e8c05d26 100644
--- a/vllm/spec_decode/mlp_speculator_worker.py
+++ b/vllm/spec_decode/mlp_speculator_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py
index 6275c460e..18e7b055a 100644
--- a/vllm/spec_decode/mqa_scorer.py
+++ b/vllm/spec_decode/mqa_scorer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.sequence import (ExecuteModelRequest, SequenceData,
                            SequenceGroupMetadata, get_all_seq_ids)
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index de57403d1..4a9bbe44d 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import weakref
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index 57ae173af..7a1a0e56d 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 from typing import List, Optional, Set, Tuple
diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py
index 2829d631b..fb44275aa 100644
--- a/vllm/spec_decode/proposer_worker_base.py
+++ b/vllm/spec_decode/proposer_worker_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py
index ea3d91d78..91256cab6 100644
--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 252c80957..7dda1cbfe 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 from collections import defaultdict
diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py
index 08e773c56..ca89eb60a 100644
--- a/vllm/spec_decode/target_model_runner.py
+++ b/vllm/spec_decode/target_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional
 
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index b538923c0..afd91b42b 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 466269b21..22d2a4833 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from contextlib import contextmanager
diff --git a/vllm/test_utils.py b/vllm/test_utils.py
index f8cec380f..c6b126d00 100644
--- a/vllm/test_utils.py
+++ b/vllm/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 MODELS_ON_S3 = [
     "adept/fuyu-8b",
     "ai21labs/AI21-Jamba-1.5-Mini",
diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py
index 7ed9ced0e..d215e5d8b 100644
--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # copied from https://pypi.org/project/nvidia-ml-py
 # version 12.570.86
 
diff --git a/vllm/tracing.py b/vllm/tracing.py
index 557ae40b8..6a287d82b 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from collections.abc import Mapping
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py
index 84bd7a747..6d4231bac 100644
--- a/vllm/transformers_utils/__init__.py
+++ b/vllm/transformers_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import envs
 
diff --git a/vllm/transformers_utils/chat_templates/__init__.py b/vllm/transformers_utils/chat_templates/__init__.py
index fe2bd3ca4..2783d12a2 100644
--- a/vllm/transformers_utils/chat_templates/__init__.py
+++ b/vllm/transformers_utils/chat_templates/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from .registry import get_chat_template_fallback_path
 
 __all__ = ["get_chat_template_fallback_path"]
diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py
index 853fed5d4..e0ef7f099 100644
--- a/vllm/transformers_utils/chat_templates/registry.py
+++ b/vllm/transformers_utils/chat_templates/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 from typing import Callable, Optional, Union
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 8774f95a2..9bc3b8e09 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import json
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index ed10c22c8..7edff455f 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.cohere2 import Cohere2Config
diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py
index 2261f0a9e..a789b93b5 100644
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py
index 43e9503ff..7c5de3e94 100644
--- a/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py
index 21328d767..e547a9c28 100644
--- a/vllm/transformers_utils/configs/cohere2.py
+++ b/vllm/transformers_utils/configs/cohere2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 
diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py
index bffa127fe..7dbda99f8 100644
--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index a54486fa4..957d63831 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
 
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index a43e4746c..fb2e8a1df 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional, Union
diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py
index 25bafbb85..7450904a1 100644
--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copied from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py
index f161a06f3..2f5400463 100644
--- a/vllm/transformers_utils/configs/falcon.py
+++ b/vllm/transformers_utils/configs/falcon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py
index 48b5d79ff..b36a6dd59 100644
--- a/vllm/transformers_utils/configs/h2ovl.py
+++ b/vllm/transformers_utils/configs/h2ovl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py
index 8ea62546e..4494ebfef 100644
--- a/vllm/transformers_utils/configs/internvl.py
+++ b/vllm/transformers_utils/configs/internvl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index b947c6a9e..767c4ddae 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py
index 97ff44bb9..ae8dac0f3 100644
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
 from typing import Optional, Union
 
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py
index 885713c5d..9ba52956a 100644
--- a/vllm/transformers_utils/configs/medusa.py
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional, Union
diff --git a/vllm/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py
index 660e870ac..e3b63dfa0 100644
--- a/vllm/transformers_utils/configs/minimax_text_01.py
+++ b/vllm/transformers_utils/configs/minimax_text_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ MiniMaxText01 model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
diff --git a/vllm/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py
index 99e0d249d..c62497192 100644
--- a/vllm/transformers_utils/configs/minimax_vl_01.py
+++ b/vllm/transformers_utils/configs/minimax_vl_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """MiniMaxVL01 model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py
index eb77e09ad..f0cd2d52a 100644
--- a/vllm/transformers_utils/configs/mllama.py
+++ b/vllm/transformers_utils/configs/mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from transformers.models.mllama import configuration_mllama as mllama_hf_config
 
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
index 70f607529..2fa284e5c 100644
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/configs/moonvit.py b/vllm/transformers_utils/configs/moonvit.py
index a2b4059a6..a6f712f3d 100644
--- a/vllm/transformers_utils/configs/moonvit.py
+++ b/vllm/transformers_utils/configs/moonvit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
 from transformers.configuration_utils import PretrainedConfig
 
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py
index 2d52658d3..91316408d 100644
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copied from
 # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index fdf4fa2a5..d65b572dc 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 HuggingFace Inc. team. All rights reserved.
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py
index 300f6e211..a533720af 100644
--- a/vllm/transformers_utils/configs/nvlm_d.py
+++ b/vllm/transformers_utils/configs/nvlm_d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py
index 0ec224214..c2728f0ed 100644
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py
index ef5f9ba85..33a45220e 100644
--- a/vllm/transformers_utils/configs/skyworkr1v.py
+++ b/vllm/transformers_utils/configs/skyworkr1v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py
diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py
index 6eaf699d1..a83dfa40b 100644
--- a/vllm/transformers_utils/configs/solar.py
+++ b/vllm/transformers_utils/configs/solar.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py
index 5da6c5b44..050a7851d 100644
--- a/vllm/transformers_utils/configs/telechat2.py
+++ b/vllm/transformers_utils/configs/telechat2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py
 """ Telechat configuration compatible with LlamaConfig. """
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 4c5072427..62f63b02d 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
 from typing import Any, Optional
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 3adf2e32c..380c62a14 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py
index 7373fa0ed..342632989 100644
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index ce6427de4..70cd08263 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast
diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py
index 2bd9ab1f0..14d15f2bc 100644
--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.transformers_utils.processors.deepseek_vl2 import (
     DeepseekVLV2Processor)
diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py
index df960e9c7..b4669d12f 100644
--- a/vllm/transformers_utils/processors/deepseek_vl2.py
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py
index f1c6407e1..4fe76d0df 100644
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index 1c3520bcf..f95aae781 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
 import os
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index fa7a208c4..ae96ebe4e 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import copy
diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py
index d69e5a6b4..20e5fea71 100644
--- a/vllm/transformers_utils/tokenizer_base.py
+++ b/vllm/transformers_utils/tokenizer_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
 from abc import ABC, abstractmethod
diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py
index 8b9e4881e..eb53cceaa 100644
--- a/vllm/transformers_utils/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py
index 7aac29a6b..941156c4b 100644
--- a/vllm/transformers_utils/tokenizers/__init__.py
+++ b/vllm/transformers_utils/tokenizers/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
                       truncate_tool_call_ids, validate_request_params)
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 23b6f67f0..fcc0f538f 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from dataclasses import dataclass
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 8dff1b612..66c8fb797 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from functools import cache
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 9f14a907a..0fcf5d15a 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder,
                                          TritonPlaceholder)
diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index 8cf2e01a3..068fa3031 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import types
 from importlib.util import find_spec
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 90af0c63c..c14963763 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
 import json
diff --git a/vllm/utils.py b/vllm/utils.py
index b4152e6b2..41336b80e 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 9ed3dec7f..9e989df1c 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashAttention."""
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 1c4f7f62f..8bd998eba 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashInfer."""
 from __future__ import annotations
 
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 1edfab26b..96befca5a 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 # MLA Common Components
 
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index e6594c6b6..060a7c9d8 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Optional
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index d1e823bbe..8925b5a5c 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Optional
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 2e6b619db..0857fc133 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index 8187e457d..896f1394c 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Optional
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index a97bb8500..6a3314dd8 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with PagedAttention and Triton prefix prefill."""
 from typing import TYPE_CHECKING, Any, Optional
 
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 10a771e83..2e65619ed 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import torch
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index a0a065df9..27eaca497 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import defaultdict
 from collections.abc import Iterable
 from typing import Callable, Optional
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 05d70bb9b..16dc67b9b 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING
 
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 59e07382b..91999d300 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import defaultdict
 from dataclasses import dataclass
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 3ccad97e9..61476362e 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """KV-Cache Utilities."""
 import os
 from collections import deque
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index 055ce4460..dd5052a34 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Optional, Union
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index 257234430..b404c70eb 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index ce16a1ed5..e510a0626 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py
index 3a0028a59..1397c5f4c 100644
--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.v1.request import Request, RequestStatus
 
 
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index e69e9ac9f..233c73e88 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Callable
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 0c9f61a76..d1bec2523 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import time
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 4b235c596..0e3696321 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 from collections.abc import AsyncGenerator, Mapping
 from copy import copy
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
index b84d4b144..4f6ba099c 100644
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import multiprocessing
 import time
 import weakref
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 7253d1dc6..f36a491a1 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import queue
 import signal
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index fa01998aa..adb0709c8 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import contextlib
 import queue
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index dca327cc5..c6fe2d339 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Optional
 
diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py
index 97dd31d5e..692ba9dc8 100644
--- a/vllm/v1/engine/exceptions.py
+++ b/vllm/v1/engine/exceptions.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 class EngineGenerateError(Exception):
     """Raised when a AsyncLLM.generate() fails. Recoverable."""
     pass
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index c856e2645..736ffd8b4 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping
 from copy import copy
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 03d82b6bb..edc3be5b0 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Iterable
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index 45fb5cd23..abe98a13d 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 from typing import Optional
 
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 293c291b4..1dcfbab30 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from collections.abc import Iterable
diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py
index 4df7ca597..1e9911152 100644
--- a/vllm/v1/engine/parallel_sampling.py
+++ b/vllm/v1/engine/parallel_sampling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import copy
 from typing import Optional
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 64a756148..5c0d01d9b 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections.abc import Mapping, Sequence
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 3b9feb0d3..50b9634a4 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from concurrent.futures import Future
 from typing import Callable, Union
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index eb5f9d4bf..0bd7383b5 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import multiprocessing
 import os
 import pickle
diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py
index 320ebfd37..257564793 100644
--- a/vllm/v1/executor/ray_distributed_executor.py
+++ b/vllm/v1/executor/ray_distributed_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from concurrent.futures import Future
 from typing import Union
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 2747fc7fa..cf2eb3b95 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 from dataclasses import dataclass
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 665e5873d..2d621ec31 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import time
diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py
index a364b286d..61ba5d66c 100644
--- a/vllm/v1/metrics/prometheus.py
+++ b/vllm/v1/metrics/prometheus.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import tempfile
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index a51c3ed7f..18c8dcf0a 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
 from typing import Optional, Union
 
diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py
index 5ab78129a..4d6e59984 100644
--- a/vllm/v1/metrics/reader.py
+++ b/vllm/v1/metrics/reader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 8fe163061..50c8b07fe 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from dataclasses import dataclass, field
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index e8ce0df5e..17a299d57 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import NamedTuple, Optional
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 42c75ef96..53fd70fab 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from typing import TYPE_CHECKING, Any, Optional, Union
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index e97e1235f..ab13b288a 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional
diff --git a/vllm/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py
index 2984d4e48..1b699565f 100644
--- a/vllm/v1/sample/ops/bad_words.py
+++ b/vllm/v1/sample/ops/bad_words.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py
index ed05e3f48..48423b9b4 100644
--- a/vllm/v1/sample/ops/penalties.py
+++ b/vllm/v1/sample/ops/penalties.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 4a5fbb10d..30396f159 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 17b870fed..b2354c533 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 16561d30a..8ba3c2087 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that samples the next tokens from the model's outputs."""
 
 import torch
diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py
index a1c7dcdb1..4c1ac4895 100644
--- a/vllm/v1/sample/tpu/metadata.py
+++ b/vllm/v1/sample/tpu/metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
 from typing import Optional
 
diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py
index 7c31a2984..1056eb1d7 100644
--- a/vllm/v1/sample/tpu/sampler.py
+++ b/vllm/v1/sample/tpu/sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampler layer implementing TPU supported operations."""
 
 import torch
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index 78f37c1e8..ab6653a78 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import pickle
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 1ca856423..416bc8af1 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 import torch.nn as nn
 
diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py
index fdac2ef64..f516bf486 100644
--- a/vllm/v1/spec_decode/medusa.py
+++ b/vllm/v1/spec_decode/medusa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.nn as nn
diff --git a/vllm/v1/spec_decode/metadata.py b/vllm/v1/spec_decode/metadata.py
index 1cf650d5f..b1efb4061 100644
--- a/vllm/v1/spec_decode/metadata.py
+++ b/vllm/v1/spec_decode/metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import numpy as np
diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py
index 36091bef2..b4bc3058c 100644
--- a/vllm/v1/spec_decode/metrics.py
+++ b/vllm/v1/spec_decode/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass, field
 from typing import Optional
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index 704153d43..6b90d0970 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import numpy as np
diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py
index 334258e7f..5c37333ce 100644
--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.triton_utils import tl, triton
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 07b422814..b2b0ee796 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import multiprocessing
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 55c5f6090..02e7fc33f 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index 09f6cdf73..d500783aa 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index f2570221d..88544565e 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index 9a7e30d41..fc365f125 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import dataclasses
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index 111e92dc0..7adee7237 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index d347efc42..5b497e66c 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import multiprocessing
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 576086ebe..958262c49 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy as np
 import torch
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index b3e65917d..bb986b604 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Datastructures defining an input batch
 
 from dataclasses import dataclass
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9f7c474c7..c96ad0c01 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import gc
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index dd06e7296..f36cf5d5c 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
 import gc
 import os
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index eb8ed6221..afa41a37e 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Define LoRA functionality mixin for model runners.
 """
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index c5171b973..48ea3cb7b 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import bisect
 import gc
 import time
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index bf0a5777c..8d2f8112d 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A TPU worker class."""
 import os
 from typing import Optional
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 91548a52c..b23b28c1d 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index 487a49b62..9c93754f9 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/version.py b/vllm/version.py
index 8329d7bec..6c88b1b5a 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 try:
     from ._version import __version__, __version_tuple__
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index d48a6957c..530907012 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """CacheEngine class for managing the KV cache."""
 from typing import List
 
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index 82eeeb570..677d66357 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index fb436a079..6213cf760 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import weakref
diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py
index 2a60e5126..174f86f48 100644
--- a/vllm/worker/cpu_pooling_model_runner.py
+++ b/vllm/worker/cpu_pooling_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 1436a4043..b04a9a1eb 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A CPU worker class."""
 import os
 from typing import Dict, List, Optional, Set, Tuple, Type
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 3957e5608..a3e7b0147 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import itertools
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index e2261cbb2..17123d2b4 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 533fead0e..6d76ea499 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 8c968faa7..75501e0f7 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import gc
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 935325cb2..d567ce4a6 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from abc import ABC, abstractmethod
diff --git a/vllm/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py
index 2c5e2eac7..f0210c13c 100644
--- a/vllm/worker/multi_step_hpu_worker.py
+++ b/vllm/worker/multi_step_hpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2025 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index f8d5acf58..cc0cc855e 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import functools
diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py
index aafb7ab7c..336e41649 100644
--- a/vllm/worker/multi_step_neuron_model_runner.py
+++ b/vllm/worker/multi_step_neuron_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from importlib.util import find_spec
 from typing import List, Optional
diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
index 3a9c0993e..de9827723 100644
--- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py
+++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import List, Optional
 
 import torch
diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py
index 387119998..ed9f00166 100644
--- a/vllm/worker/multi_step_tpu_worker.py
+++ b/vllm/worker/multi_step_tpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Dict, Optional, Tuple
diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py
index 3518ab2f6..ea16e14f9 100644
--- a/vllm/worker/multi_step_worker.py
+++ b/vllm/worker/multi_step_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from dataclasses import dataclass
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 3aff3e01a..28855bb46 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from dataclasses import dataclass
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 64daee31b..662bde6bc 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A Neuron worker class."""
 import os
 from typing import List, Optional, Set, Tuple
diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py
index 9cd4f88d3..2a0f4e77c 100644
--- a/vllm/worker/neuronx_distributed_model_runner.py
+++ b/vllm/worker/neuronx_distributed_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Set
 
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index 912e04c43..be6b3d137 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index e0cca9072..5f1535271 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import time
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 4bb9bea02..ad5ed19e2 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import List, Optional, Tuple, Union
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index e2854bcb3..1a5f62cb3 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 '''
 Worker-related helper functions.
 '''
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 2a4317271..9a9286326 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
 import gc
 import os
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index e5662e693..db1ca2d8f 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import os
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 79fa7d2c7..ecbb63d91 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import time
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index a5109a982..fe321c059 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A XPU worker class."""
 import gc
 import os
-- 
GitLab


From 19bdaf32b139656627c8b311361a0fa38ae98f4b Mon Sep 17 00:00:00 2001
From: SorenDreano <71752785+SorenDreano@users.noreply.github.com>
Date: Tue, 3 Jun 2025 20:50:55 +0200
Subject: [PATCH 141/274] [Doc] Readme standardization (#18695)

Co-authored-by: Soren Dreano <soren@numind.ai>
---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 67f6b957e..ec16d7583 100644
--- a/README.md
+++ b/README.md
@@ -58,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
 - Speculative decoding
 - Chunked prefill
 
@@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
 - Prefix caching support
 - Multi-LoRA support
 
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
-- Embedding Models (e.g. E5-Mistral)
+- Embedding Models (e.g., E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)
 
 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 
 ## Media Kit
 
-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
-- 
GitLab


From 01eee4053606458b2596818acd1fffee699ed75d Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Wed, 4 Jun 2025 03:08:21 +0800
Subject: [PATCH 142/274] [doc] update docker version (#19074)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/deployment/docker.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md
index 9e506d3d7..93d9e80f5 100644
--- a/docs/deployment/docker.md
+++ b/docs/deployment/docker.md
@@ -46,11 +46,11 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
     create a custom Dockerfile on top of the base image with an extra layer that installs them:
 
     ```Dockerfile
-    FROM vllm/vllm-openai:v0.8.3
+    FROM vllm/vllm-openai:v0.9.0
 
     # e.g. install the `audio` optional dependencies
     # NOTE: Make sure the version of vLLM matches the base image!
-    RUN uv pip install --system vllm[audio]==0.8.3
+    RUN uv pip install --system vllm[audio]==0.9.0
     ```
 
 !!! tip
-- 
GitLab


From fa98d77773c649de05a4bda9847682c80287aa36 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Tue, 3 Jun 2025 15:30:02 -0400
Subject: [PATCH 143/274] [Kernel] DeepEP dispatch-combine kernel integration
 (#18434)

Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
---
 csrc/moe/topk_softmax_kernels.cu              |  16 +-
 tests/kernels/moe/__init__.py                 |   0
 tests/kernels/moe/deepep_utils.py             | 188 +++++++
 tests/kernels/moe/test_deepep_deepgemm_moe.py | 371 ++++++++++++++
 tests/kernels/moe/test_deepep_moe.py          | 459 ++++++++++++++++++
 vllm/config.py                                |   2 +
 .../device_communicators/all2all.py           | 146 +++++-
 .../device_communicators/cuda_communicator.py |   8 +
 vllm/envs.py                                  |   2 +
 .../layers/fused_moe/deep_gemm_moe.py         |  32 +-
 .../fused_moe/deepep_ht_prepare_finalize.py   | 236 +++++++++
 .../fused_moe/deepep_ll_prepare_finalize.py   | 152 ++++++
 .../layers/fused_moe/fused_batched_moe.py     |  57 ++-
 .../layers/fused_moe/fused_moe.py             |   2 +-
 vllm/model_executor/layers/fused_moe/layer.py | 148 ++++--
 .../layers/fused_moe/modular_kernel.py        | 162 +++++--
 .../layers/fused_moe/moe_permute_unpermute.py |   5 +-
 .../layers/fused_moe/pplx_prepare_finalize.py |  11 +-
 .../layers/fused_moe/prepare_finalize.py      |  12 +-
 .../layers/fused_moe/triton_deep_gemm_moe.py  |   7 +-
 vllm/model_executor/layers/fused_moe/utils.py |   4 +-
 .../model_executor/layers/quantization/fp8.py |  41 +-
 vllm/platforms/cuda.py                        |  15 +
 23 files changed, 1952 insertions(+), 124 deletions(-)
 create mode 100644 tests/kernels/moe/__init__.py
 create mode 100644 tests/kernels/moe/deepep_utils.py
 create mode 100644 tests/kernels/moe/test_deepep_deepgemm_moe.py
 create mode 100644 tests/kernels/moe/test_deepep_moe.py
 create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
 create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index a93790322..10be47966 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -516,9 +516,8 @@ void topk_softmax(
             topk,
             stream);
     }
-    else
+    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
     {
-        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
         vllm::moe::topkGatingSoftmaxKernelLauncher(
             gating_output.data_ptr<float>(),
             topk_weights.data_ptr<float>(),
@@ -530,4 +529,17 @@ void topk_softmax(
             topk,
             stream);
     }
+    else {
+        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int64_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
diff --git a/tests/kernels/moe/__init__.py b/tests/kernels/moe/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py
new file mode 100644
index 000000000..2bc9b657d
--- /dev/null
+++ b/tests/kernels/moe/deepep_utils.py
@@ -0,0 +1,188 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+DeepEP test utilities
+"""
+import dataclasses
+import importlib
+import traceback
+from typing import Callable, Optional
+
+import torch
+from torch.distributed import ProcessGroup
+from torch.multiprocessing import (
+    spawn)  # pyright: ignore[reportPrivateImportUsage]
+from typing_extensions import Concatenate, ParamSpec
+
+has_deep_ep = importlib.util.find_spec("deep_ep") is not None
+if has_deep_ep:
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+        DeepEPHTPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+        DeepEPLLPrepareAndFinalize)
+
+## Parallel Processes Utils
+
+P = ParamSpec("P")
+
+
+@dataclasses.dataclass
+class ProcessGroupInfo:
+    world_size: int
+    world_local_size: int
+    rank: int
+    node_rank: int
+    local_rank: int
+    device: torch.device
+
+
+def _worker_parallel_launch(
+    local_rank: int,
+    world_size: int,
+    world_local_size: int,
+    node_rank: int,
+    init_method: str,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    rank = node_rank * world_local_size + local_rank
+    torch.cuda.set_device(local_rank)
+    device = torch.device("cuda", local_rank)
+    torch.distributed.init_process_group(
+        backend="cpu:gloo,cuda:nccl",
+        init_method=init_method,
+        rank=rank,
+        world_size=world_size,
+        device_id=device,
+    )
+    barrier = torch.tensor([rank], device=device)
+    torch.distributed.all_reduce(barrier)
+
+    try:
+        worker(
+            ProcessGroupInfo(
+                world_size=world_size,
+                world_local_size=world_local_size,
+                rank=rank,
+                node_rank=node_rank,
+                local_rank=local_rank,
+                device=device,
+            ),
+            *args,
+            **kwargs,
+        )
+    except Exception as ex:
+        print(ex)
+        traceback.print_exc()
+        raise
+    finally:
+        torch.distributed.destroy_process_group()
+
+
+def parallel_launch(
+    world_size: int,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    assert not kwargs
+    spawn(
+        _worker_parallel_launch,
+        args=(
+            world_size,
+            world_size,
+            0,
+            "tcp://localhost:29500",
+            worker,
+        ) + args,
+        nprocs=world_size,
+        join=True,
+    )
+
+
+## DeepEP specific utils
+
+
+@dataclasses.dataclass
+class DeepEPHTArgs:
+    num_local_experts: int
+
+
+@dataclasses.dataclass
+class DeepEPLLArgs:
+    max_tokens_per_rank: int
+    hidden_size: int
+    num_experts: int
+    use_fp8_dispatch: bool
+
+
+def make_deepep_ht_a2a(pg: ProcessGroup,
+                       pgi: ProcessGroupInfo,
+                       dp_size: int,
+                       ht_args: DeepEPHTArgs,
+                       q_dtype: Optional[torch.dtype] = None,
+                       block_shape: Optional[list[int]] = None):
+
+    import deep_ep
+
+    # high throughput a2a
+    num_nvl_bytes = 1024 * 1024 * 1024  # 1GB
+    num_rdma_bytes, low_latency_mode, num_qps_per_rank = 0, False, 1
+    buffer = deep_ep.Buffer(group=pg,
+                            num_nvl_bytes=num_nvl_bytes,
+                            num_rdma_bytes=num_rdma_bytes,
+                            low_latency_mode=low_latency_mode,
+                            num_qps_per_rank=num_qps_per_rank)
+    return DeepEPHTPrepareAndFinalize(buffer=buffer,
+                                      world_size=pgi.world_size,
+                                      rank=pgi.rank,
+                                      dp_size=dp_size,
+                                      rank_expert_offset=pgi.rank *
+                                      ht_args.num_local_experts,
+                                      quant_dtype=q_dtype,
+                                      block_shape=block_shape)
+
+
+def make_deepep_ll_a2a(pg: ProcessGroup,
+                       pgi: ProcessGroupInfo,
+                       dp_size: int,
+                       deepep_ll_args: DeepEPLLArgs,
+                       q_dtype: Optional[torch.dtype] = None,
+                       block_shape: Optional[list[int]] = None):
+
+    import deep_ep
+
+    # low-latency a2a
+    num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+        deepep_ll_args.max_tokens_per_rank, deepep_ll_args.hidden_size,
+        pgi.world_size, deepep_ll_args.num_experts)
+
+    buffer = deep_ep.Buffer(group=pg,
+                            num_rdma_bytes=num_rdma_bytes,
+                            low_latency_mode=True,
+                            num_qps_per_rank=deepep_ll_args.num_experts //
+                            pgi.world_size)
+    return DeepEPLLPrepareAndFinalize(
+        buffer=buffer,
+        world_size=pgi.world_size,
+        dp_size=dp_size,
+        max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
+        quant_dtype=q_dtype,
+        use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
+    )
+
+
+def make_deepep_a2a(pg: ProcessGroup,
+                    pgi: ProcessGroupInfo,
+                    dp_size: int,
+                    deepep_ht_args: Optional[DeepEPHTArgs],
+                    deepep_ll_args: Optional[DeepEPLLArgs],
+                    q_dtype: Optional[torch.dtype] = None,
+                    block_shape: Optional[list[int]] = None):
+    if deepep_ht_args is not None:
+        assert deepep_ll_args is None
+        return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype,
+                                  block_shape)
+
+    assert deepep_ll_args is not None
+    return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype)
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
new file mode 100644
index 000000000..a1fdc1d5f
--- /dev/null
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -0,0 +1,371 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Test DeepEP + DeepGEMM integration 
+"""
+
+import dataclasses
+import importlib
+from typing import Optional
+
+import pytest
+import torch.distributed
+from torch.distributed import ProcessGroup
+from typing_extensions import ParamSpec
+
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8)
+from vllm.platforms import current_platform
+
+from .deepep_utils import ProcessGroupInfo, parallel_launch
+
+has_deep_ep = importlib.util.find_spec("deep_ep") is not None
+
+try:
+    import deep_gemm
+    has_deep_gemm = True
+except ImportError:
+    has_deep_gemm = False
+
+if has_deep_ep:
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+        DeepEPHTPrepareAndFinalize)
+
+    from .deepep_utils import DeepEPHTArgs, make_deepep_a2a
+
+if has_deep_gemm:
+    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+        DeepGemmExperts)
+
+requires_deep_ep = pytest.mark.skipif(
+    not has_deep_ep,
+    reason="Requires deep_ep kernels",
+)
+
+requires_deep_gemm = pytest.mark.skipif(
+    not has_deep_gemm,
+    reason="Requires deep_gemm kernels",
+)
+
+P = ParamSpec("P")
+
+
+def per_block_cast_to_fp8(
+        x: torch.Tensor,
+        block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (deep_gemm.ceil_div(m, 128) * 128,
+         deep_gemm.ceil_div(n, block_size_n) * block_size_n),
+        dtype=x.dtype,
+        device=x.device)
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
+    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+    return x_scaled_sub, scales
+
+
+def make_block_quant_fp8_weights(
+    e: int,
+    n: int,
+    k: int,
+    block_size: list[int],
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Return weights w1, w2, w1q, w2q, w1_scale, w2_scale
+    """
+    dtype = torch.bfloat16
+
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    w1_bf16 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
+    w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
+
+    w2_bf16 = torch.randn((e, k, n), dtype=dtype) / 10
+    w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles_w1 = ((2 * n) + block_n - 1) // block_n
+    k_tiles_w1 = (k + block_k - 1) // block_k
+    n_tiles_w2 = (k + block_n - 1) // block_n
+    k_tiles_w2 = (n + block_k - 1) // block_k
+
+    w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn)
+    w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn)
+
+    w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1),
+                       device="cuda",
+                       dtype=torch.float32)
+    w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2),
+                       device="cuda",
+                       dtype=torch.float32)
+
+    assert w1_s.shape == (e, (2 * n + 127) // 128, (k + 127) // 128)
+    assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2]
+
+    for i in range(e):
+        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
+        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
+
+    return w1, w2, w1_s, w2_s
+
+
+@dataclasses.dataclass
+class TestConfig:
+    topk: int
+    m: int
+    k: int
+    n: int
+    num_experts: int
+    block_size: list[int]
+
+
+@dataclasses.dataclass
+class TestTensors:
+    rank_tokens: torch.Tensor  # all ranks make this many tokens
+    rank_token_scales: Optional[torch.Tensor]
+    topk: torch.Tensor
+    topk_weights: torch.Tensor
+    config: TestConfig
+
+    @staticmethod
+    def make(config: TestConfig, rank) -> "TestTensors":
+
+        dtype = torch.bfloat16
+        topk, m, k, block_size = (config.topk, config.m, config.k,
+                                  config.block_size)
+
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        rank_tokens = torch.randn(
+            (m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0
+        rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max)
+
+        block_k = block_size[1]
+        _, rank_token_scales = per_token_group_quant_fp8(rank_tokens, block_k)
+
+        topk_ids = torch.randint(
+            low=0,
+            high=config.num_experts,
+            size=(m, topk),
+            device=torch.cuda.current_device()).to(dtype=torch.int64)
+
+        topk_weights = torch.randn(topk_ids.shape,
+                                   dtype=torch.float32,
+                                   device=torch.cuda.current_device())
+
+        return TestTensors(rank_tokens=rank_tokens,
+                           rank_token_scales=rank_token_scales,
+                           topk=topk_ids,
+                           topk_weights=topk_weights,
+                           config=config)
+
+
+def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
+                        num_local_experts: int, q_dtype: Optional[torch.dtype],
+                        block_shape: list[int]) -> FusedMoEModularKernel:
+
+    a2a: DeepEPHTPrepareAndFinalize = make_deepep_a2a(
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        deepep_ht_args=DeepEPHTArgs(num_local_experts=num_local_experts),
+        deepep_ll_args=None,
+        q_dtype=q_dtype,
+        block_shape=block_shape)
+
+    fused_experts = DeepGemmExperts()
+    mk = FusedMoEModularKernel(prepare_finalize=a2a,
+                               fused_experts=fused_experts)
+    return mk
+
+
+def deep_ep_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
+                     test_tensors: TestTensors, w1: torch.Tensor,
+                     w2: torch.Tensor, w1_scale: Optional[torch.Tensor],
+                     w2_scale: Optional[torch.Tensor],
+                     num_experts: int) -> torch.Tensor:
+
+    num_local_experts = w1.size(0)
+
+    def build_expert_map():
+        num_local_experts = w1.size(0)
+        expert_map = torch.full((num_experts, ),
+                                fill_value=-1,
+                                dtype=torch.int32)
+        s = pgi.rank * num_local_experts
+        e = s + num_local_experts
+        expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
+        return expert_map.to(device=torch.cuda.current_device(),
+                             dtype=torch.int32)
+
+    q_dtype = torch.float8_e4m3fn
+
+    # Make modular kernel
+    mk: FusedMoEModularKernel = make_modular_kernel(
+        pg, pgi, dp_size, num_local_experts, q_dtype,
+        test_tensors.config.block_size)
+
+    a1_scale = test_tensors.rank_token_scales
+
+    out = mk.forward(hidden_states=test_tensors.rank_tokens,
+                     w1=w1,
+                     w2=w2,
+                     topk_weights=test_tensors.topk_weights,
+                     topk_ids=test_tensors.topk,
+                     inplace=False,
+                     activation="silu",
+                     global_num_experts=num_experts,
+                     expert_map=build_expert_map(),
+                     w1_scale=w1_scale,
+                     w2_scale=w2_scale,
+                     w1_zp=None,
+                     w2_zp=None,
+                     a1_scale=a1_scale,
+                     a2_scale=None,
+                     apply_router_weight_on_input=False)
+    return out
+
+
+def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
+                topk_weights: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
+                w1_scale: torch.Tensor, w2_scale: torch.Tensor,
+                a1_scale: torch.Tensor, block_shape: list[int]):
+
+    return fused_experts(
+        hidden_states=a,
+        w1=w1,
+        w2=w2,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        inplace=False,
+        use_fp8_w8a8=True,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        block_shape=block_shape,
+        # Make sure this is set to False so we
+        # dont end up comparing the same implementation.
+        allow_deep_gemm=False)
+
+
+def _deep_ep_moe(
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    config: TestConfig,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+):
+    current_platform.seed_everything(pgi.rank)
+
+    w1 = w1.to(device=torch.cuda.current_device())
+    w2 = w2.to(device=torch.cuda.current_device())
+    w1_scale = w1_scale.to(device=torch.cuda.current_device())
+    w2_scale = w2_scale.to(device=torch.cuda.current_device())
+
+    pg = torch.distributed.new_group(list(range(pgi.world_size)))
+    test_tensors = TestTensors.make(config, pgi.rank)
+    block_shape = [
+        w1.size(1) // w1_scale.size(1),
+        w1.size(2) // w1_scale.size(2)
+    ]
+
+    with set_current_vllm_config(VllmConfig()):
+        # Reference
+        triton_moe = triton_impl(a=test_tensors.rank_tokens,
+                                 topk_ids=test_tensors.topk,
+                                 topk_weights=test_tensors.topk_weights,
+                                 w1=w1,
+                                 w2=w2,
+                                 w1_scale=w1_scale,
+                                 w2_scale=w2_scale,
+                                 a1_scale=test_tensors.rank_token_scales,
+                                 block_shape=block_shape)
+
+        # Slice experts for this rank.
+        num_local_experts = config.num_experts // pgi.world_size
+        e_start = num_local_experts * pgi.rank
+        e_end = e_start + num_local_experts
+        w1_ep = w1[e_start:e_end]
+        w2_ep = w2[e_start:e_end]
+        w1_scale_ep = w1_scale[e_start:e_end]
+        w2_scale_ep = w2_scale[e_start:e_end]
+
+        deepep_moe = deep_ep_moe_impl(
+            pg,
+            pgi,
+            dp_size,
+            test_tensors,
+            w1_ep,
+            w2_ep,
+            w1_scale_ep,
+            w2_scale_ep,
+            config.num_experts,
+        )
+
+    torch.testing.assert_close(
+        triton_moe,
+        deepep_moe,
+        atol=6e-2,
+        rtol=6e-2,
+    )
+
+
+MNKs = [
+    (8, 128, 128),
+    (8, 128, 512),
+    (8, 512, 512),
+    (3, 1024, 2048),
+    (32, 128, 1024),
+    (45, 512, 2048),
+    (64, 1024, 1024),
+    (129, 128, 256),
+    (129, 1024, 2048),
+    (222, 1024, 2048),
+]
+
+
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@requires_deep_ep
+@requires_deep_gemm
+def test_deep_ep_moe(mnk: tuple[int, int, int], num_experts: int, topk: int,
+                     world_dp_size: tuple[int, int]):
+
+    m, n, k = mnk
+    current_platform.seed_everything(7)
+
+    if topk > num_experts:
+        pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
+
+    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
+    block_size = [block_m, block_m]
+
+    world_size, dp_size = world_dp_size
+    config = TestConfig(
+        topk=topk,
+        m=m,
+        k=k,
+        n=n,
+        num_experts=num_experts,
+        block_size=block_size,
+    )
+
+    w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights(
+        num_experts, n, k, block_size)
+
+    parallel_launch(world_size, _deep_ep_moe, dp_size, config, w1, w2,
+                    w1_scale, w2_scale)
diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py
new file mode 100644
index 000000000..7e029ea95
--- /dev/null
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -0,0 +1,459 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Test deepep dispatch-combine logic
+"""
+
+import dataclasses
+import importlib
+from typing import Optional, Union
+
+import pytest
+import torch.distributed
+from torch.distributed import ProcessGroup
+
+from vllm import _custom_ops as ops
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+    BatchedTritonExperts)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8)
+from vllm.platforms import current_platform
+
+from .deepep_utils import ProcessGroupInfo, parallel_launch
+
+has_deep_ep = importlib.util.find_spec("deep_ep") is not None
+
+if has_deep_ep:
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+        DeepEPHTPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+        DeepEPLLPrepareAndFinalize)
+
+    from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
+
+requires_deep_ep = pytest.mark.skipif(
+    not has_deep_ep,
+    reason="Requires deep_ep kernels",
+)
+
+MAX_TOKENS_PER_RANK = 64
+
+
+def make_weights(
+        e, n, k, dtype
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Return weights w1, w2, w1_scale, w2_scale
+    """
+    if dtype in [torch.float16, torch.bfloat16]:
+        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+        return w1, w2, None, None
+
+    # per-out-channel weight quantization
+    assert dtype == torch.float8_e4m3fn
+    w1 = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float16)
+    w2 = torch.empty((e, k, n), device="cuda", dtype=torch.float16)
+
+    n_b_scales = 2 * n
+    k_b_scales = k
+    w1_q = torch.empty_like(w1, dtype=dtype)
+    w2_q = torch.empty_like(w2, dtype=dtype)
+    w1_scale = torch.empty((e, n_b_scales, 1),
+                           device="cuda",
+                           dtype=torch.float32)
+    w2_scale = torch.empty((e, k_b_scales, 1),
+                           device="cuda",
+                           dtype=torch.float32)
+    for expert in range(e):
+        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
+            w1[expert], use_per_token_if_dynamic=True)
+        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
+            w2[expert], use_per_token_if_dynamic=True)
+    return w1_q, w2_q, w1_scale, w2_scale
+
+
+@dataclasses.dataclass
+class TestConfig:
+    dtype: torch.dtype
+    topk: int
+    m: int
+    k: int
+    n: int
+    num_experts: int
+
+
+@dataclasses.dataclass
+class TestTensors:
+    rank_tokens: torch.Tensor  # all ranks make this many tokens
+    rank_token_scales: Optional[torch.Tensor]
+    topk: torch.Tensor
+    topk_weights: torch.Tensor
+    config: TestConfig
+
+    @staticmethod
+    def make(config: TestConfig, low_latency_mode: bool) -> "TestTensors":
+        # TODO (varun) - check that float16 works ?
+        assert config.dtype in [torch.bfloat16, torch.float8_e4m3fn]
+        token_dtype = (torch.bfloat16 if config.dtype == torch.float8_e4m3fn
+                       else config.dtype)
+        rank_tokens = torch.randn(
+            (config.m, config.k), device="cuda", dtype=token_dtype) / 10
+        rank_token_scales = None
+        if config.dtype == torch.float8_e4m3fn:
+            # low_latency_mode kernels dont support per-token quant.
+            _, rank_token_scales = ops.scaled_fp8_quant(
+                rank_tokens, use_per_token_if_dynamic=not low_latency_mode)
+
+        topk = torch.randint(low=0,
+                             high=config.num_experts,
+                             size=(config.m, config.topk),
+                             device="cuda").to(dtype=torch.int64)
+        topk_weights = torch.randn(topk.shape,
+                                   dtype=torch.float32,
+                                   device="cuda")
+        return TestTensors(rank_tokens=rank_tokens,
+                           rank_token_scales=rank_token_scales,
+                           topk=topk,
+                           topk_weights=topk_weights,
+                           config=config)
+
+
+def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                        low_latency_mode: bool, hidden_size: int, dp_size: int,
+                        num_experts: int, num_local_experts: int,
+                        q_dtype: Optional[torch.dtype],
+                        use_fp8_dispatch: bool) -> FusedMoEModularKernel:
+
+    is_quantized = q_dtype is not None
+
+    ht_args: Optional[DeepEPHTArgs] = None
+    ll_args: Optional[DeepEPLLArgs] = None
+
+    if low_latency_mode:
+        ll_args = DeepEPLLArgs(max_tokens_per_rank=MAX_TOKENS_PER_RANK,
+                               hidden_size=hidden_size,
+                               num_experts=num_experts,
+                               use_fp8_dispatch=use_fp8_dispatch)
+    else:
+        assert not use_fp8_dispatch, (
+            "FP8 Dispatch is valid only for low-latency kernels")
+        ht_args = DeepEPHTArgs(num_local_experts=num_local_experts)
+
+    a2a : Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = \
+        make_deepep_a2a(pg = pg,
+                        pgi = pgi,
+                        dp_size = dp_size,
+                        q_dtype = q_dtype,
+                        block_shape = None,
+                        deepep_ht_args = ht_args,
+                        deepep_ll_args = ll_args)
+
+    if low_latency_mode:
+        fused_experts = BatchedTritonExperts(
+            max_num_tokens=MAX_TOKENS_PER_RANK,
+            world_size=pgi.world_size,
+            dp_size=dp_size,
+            use_fp8_w8a8=is_quantized,
+            use_int8_w8a8=False,
+            use_int8_w8a16=False,
+            use_int4_w4a16=False)
+    else:
+        fused_experts = TritonExperts(use_fp8_w8a8=is_quantized,
+                                      use_int8_w8a8=False,
+                                      use_int8_w8a16=False,
+                                      use_int4_w4a16=False,
+                                      per_channel_quant=False)
+
+    mk = FusedMoEModularKernel(prepare_finalize=a2a,
+                               fused_experts=fused_experts)
+    return mk
+
+
+def deep_ep_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                     low_latency_mode: bool, dp_size: int,
+                     test_tensors: TestTensors, w1: torch.Tensor,
+                     w2: torch.Tensor, w1_scale: Optional[torch.Tensor],
+                     w2_scale: Optional[torch.Tensor], num_experts: int,
+                     use_fp8_dispatch: bool) -> torch.Tensor:
+
+    num_local_experts = w1.size(0)
+
+    def build_expert_map():
+        num_local_experts = w1.size(0)
+        expert_map = torch.full((num_experts, ),
+                                fill_value=-1,
+                                dtype=torch.int32)
+        s = pgi.rank * num_local_experts
+        e = s + num_local_experts
+        expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
+        return expert_map.to(device=torch.cuda.current_device(),
+                             dtype=torch.int32)
+
+    hidden_size = test_tensors.rank_tokens.size(1)
+    is_quantized = w1.dtype == torch.float8_e4m3fn
+    q_dtype = None
+    if is_quantized:
+        q_dtype = torch.float8_e4m3fn
+
+    # Make modular kernel
+    mk: FusedMoEModularKernel = make_modular_kernel(pg, pgi, low_latency_mode,
+                                                    hidden_size, dp_size,
+                                                    num_experts,
+                                                    num_local_experts, q_dtype,
+                                                    use_fp8_dispatch)
+
+    out_hidden_states = torch.empty_like(test_tensors.rank_tokens)
+    total_num_tokens = test_tensors.rank_tokens.size(0)
+
+    def process_chunk(chunk_start, chunk_end, skip_result_store=False):
+        rank_tokens_chunk = test_tensors.rank_tokens[chunk_start:chunk_end]
+        topk_weights_chunk = test_tensors.topk_weights[chunk_start:chunk_end]
+        topk_chunk = test_tensors.topk[chunk_start:chunk_end]
+        rank_token_scales_chunk = test_tensors.rank_token_scales
+        if rank_token_scales_chunk is not None and rank_token_scales_chunk.size(
+                0) == total_num_tokens:
+            # per act token
+            rank_token_scales_chunk = rank_token_scales_chunk[
+                chunk_start:chunk_end]
+
+        out = mk.forward(hidden_states=rank_tokens_chunk,
+                         w1=w1,
+                         w2=w2,
+                         topk_weights=topk_weights_chunk,
+                         topk_ids=topk_chunk,
+                         inplace=False,
+                         activation="silu",
+                         global_num_experts=num_experts,
+                         expert_map=build_expert_map(),
+                         w1_scale=w1_scale,
+                         w2_scale=w2_scale,
+                         w1_zp=None,
+                         w2_zp=None,
+                         a1_scale=rank_token_scales_chunk,
+                         a2_scale=None,
+                         apply_router_weight_on_input=False)
+
+        if not skip_result_store:
+            out_hidden_states[chunk_start:chunk_end, :].copy_(
+                out, non_blocking=True)
+
+    max_num_tokens_per_dp = (MAX_TOKENS_PER_RANK
+                             if low_latency_mode else total_num_tokens)
+
+    for chunk_start_ in range(0, total_num_tokens, max_num_tokens_per_dp):
+        chunk_start = chunk_start_
+        chunk_end = min(chunk_start + max_num_tokens_per_dp, total_num_tokens)
+        # clamp start and end
+        chunk_start = min(chunk_start, total_num_tokens - 1)
+        chunk_end = min(chunk_end, total_num_tokens)
+
+        process_chunk(chunk_start,
+                      chunk_end,
+                      skip_result_store=chunk_start_ >= total_num_tokens)
+
+    return out_hidden_states
+
+
+def torch_moe_impl(test_tensors: TestTensors, w1: torch.Tensor,
+                   w2: torch.Tensor, w1_scale: Optional[torch.Tensor],
+                   w2_scale: Optional[torch.Tensor], using_fp8_dispatch: bool):
+
+    a, topk_ids, topk_weights = (test_tensors.rank_tokens, test_tensors.topk,
+                                 test_tensors.topk_weights)
+    if using_fp8_dispatch:
+        # The DeepEP implementation is requested to dispatch using FP8.
+        # For numerical stability for testing, emulate the fp8 dispatch by
+        # blockwise quant and de-quant.
+        a = test_tensors.rank_tokens
+        aq, aq_scale = per_token_group_quant_fp8(a, 128)
+        a = (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)).view(
+            a.shape).to(a.dtype)
+
+    is_quantized = w1.dtype == torch.float8_e4m3fn
+    a_dtype = a.dtype
+    if is_quantized:
+        w1 = w1.to(dtype=torch.float32) * w1_scale
+        w2 = w2.to(dtype=torch.float32) * w2_scale
+        a = a.to(dtype=torch.float32)
+
+    m, _ = a.shape
+    topk = topk_ids.size(1)
+    out = torch.zeros_like(a)
+
+    for i in range(m):
+        a_i = a[i]
+        o_i = out[i]
+        for j in range(topk):
+            e = topk_ids[i][j]
+            e_w = topk_weights[i][j]
+            w1_e = w1[e]
+            w2_e = w2[e]
+            o_i += (SiluAndMul()
+                    (a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1)) * e_w
+
+    if is_quantized:
+        out = out.to(dtype=a_dtype)
+
+    return out
+
+
+def _deep_ep_moe(
+    pgi: ProcessGroupInfo,
+    low_latency_mode: bool,
+    dp_size: int,
+    config: TestConfig,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: Optional[torch.Tensor],
+    w2_scale: Optional[torch.Tensor],
+    use_fp8_dispatch: bool,
+):
+
+    if not low_latency_mode:
+        assert not use_fp8_dispatch, (
+            "FP8 dispatch interface is available only in low-latency mode")
+
+    is_quantized = w1.dtype == torch.float8_e4m3fn
+    w1 = w1.to(device=torch.cuda.current_device())
+    w2 = w2.to(device=torch.cuda.current_device())
+    if is_quantized:
+        w1_scale = w1_scale.to(  # type: ignore
+            device=torch.cuda.current_device())
+        w2_scale = w2_scale.to(  # type: ignore
+            device=torch.cuda.current_device())
+
+    pg = torch.distributed.new_group(list(range(pgi.world_size)))
+    test_tensors = TestTensors.make(config, low_latency_mode)
+
+    with set_current_vllm_config(VllmConfig()):
+        # Reference
+        torch_combined = torch_moe_impl(test_tensors, w1, w2, w1_scale,
+                                        w2_scale, use_fp8_dispatch)
+
+        # Splice experts for this rank.
+        num_local_experts = config.num_experts // pgi.world_size
+        e_start = num_local_experts * pgi.rank
+        e_end = e_start + num_local_experts
+        w1_ep = w1[e_start:e_end]
+        w2_ep = w2[e_start:e_end]
+
+        w1_scale_ep, w2_scale_ep = None, None
+        if is_quantized:
+            w1_scale_ep = w1_scale[e_start:e_end]  # type: ignore
+            w2_scale_ep = w2_scale[e_start:e_end]  # type: ignore
+        deepep_combined = deep_ep_moe_impl(
+            pg,
+            pgi,
+            low_latency_mode,
+            dp_size,
+            test_tensors,
+            w1_ep,
+            w2_ep,
+            w1_scale_ep,
+            w2_scale_ep,
+            config.num_experts,
+            use_fp8_dispatch,
+        )
+
+    torch.testing.assert_close(
+        torch_combined,
+        deepep_combined,
+        atol=6e-2,
+        rtol=6e-2,
+    )
+
+
+MNKs = [
+    (1, 128, 128),
+    (2, 128, 512),
+    (3, 1024, 2048),
+    (32, 128, 1024),
+    (45, 512, 2048),
+    (64, 1024, 1024),
+    (222, 1024, 2048),
+]
+
+DTYPES = [torch.bfloat16, torch.float8_e4m3fn]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("topk", [6])
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@requires_deep_ep
+def test_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
+                     num_experts: int, topk: int, world_dp_size: tuple[int,
+                                                                       int]):
+    low_latency_mode = False
+    use_fp8_dispatch = False
+    m, n, k = mnk
+
+    current_platform.seed_everything(7)
+    world_size, dp_size = world_dp_size
+    config = TestConfig(dtype=dtype,
+                        topk=topk,
+                        m=m,
+                        k=k,
+                        n=n,
+                        num_experts=num_experts)
+
+    w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype)
+
+    parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size,
+                    config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch)
+
+
+MNKs = [
+    (1, 128, 2560),
+    (2, 128, 2560),
+    (3, 1024, 2560),
+    (32, 128, 2560),
+    (45, 512, 2560),
+    (64, 1024, 2560),
+    (222, 1024, 2560),
+]
+DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
+USE_FP8_DISPATCH = [True, False]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("topk", [6])
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
+@requires_deep_ep
+def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
+                                 num_experts: int, topk: int,
+                                 world_dp_size: tuple[int, int],
+                                 use_fp8_dispatch: bool):
+
+    low_latency_mode = True
+    m, n, k = mnk
+
+    if (low_latency_mode
+            and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES):
+        pytest.skip(
+            f"Skipping test as hidden size {k} is not in list of supported "
+            f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}"
+        )
+
+    current_platform.seed_everything(7)
+    world_size, dp_size = world_dp_size
+    config = TestConfig(dtype=dtype,
+                        topk=topk,
+                        m=m,
+                        k=k,
+                        n=n,
+                        num_experts=num_experts)
+
+    w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype)
+
+    parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size,
+                    config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch)
diff --git a/vllm/config.py b/vllm/config.py
index d99e501ca..f6ca9328b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1856,6 +1856,8 @@ class ParallelConfig:
         factors.append(self.pipeline_parallel_size)
         factors.append(self.tensor_parallel_size)
         factors.append(self.enable_expert_parallel)
+        factors.append(self.data_parallel_size)
+        factors.append(envs.VLLM_ALL2ALL_BACKEND)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
     def __post_init__(self) -> None:
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index ae7590299..2ab3779ec 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib.util
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.distributed as dist
@@ -129,3 +129,147 @@ class PPLXAll2AllManager(All2AllManagerBase):
             from pplx_kernels.nvshmem import nvshmem_finalize
             logger.debug("PPLX NVSHMEM finalize")
             nvshmem_finalize()
+
+
+class DeepEPAll2AllManagerBase(All2AllManagerBase):
+    """
+    All2All communication based on DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self, cpu_group):
+        has_deepep = importlib.util.find_spec("deep_ep") is not None
+        assert has_deepep, "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."  # noqa
+        super().__init__(cpu_group)
+        self.handle_cache = Cache()
+
+        # This is the DeepEP default. Stick to it till we can establish
+        # reasonable defaults based on profiling.
+        self.num_sms = 20
+
+    def get_handle(self, kwargs):
+        raise NotImplementedError
+
+    def dispatch(self, hidden_states: torch.Tensor,
+                 router_logits: torch.Tensor):
+        raise NotImplementedError
+
+    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+    def destroy(self):
+        pass
+
+
+class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
+    """
+    All2All communication based on DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def _make_all2all_kwargs(self) -> dict[Any, Any]:
+        # Defaults for internode and intranode are taken from DeepEP tests.
+        num_nvl_bytes = 1024 * 1024 * 1024
+        num_rdma_bytes = None
+        num_qps_per_rank = None
+
+        if self.internode:
+            num_rdma_bytes = 1024 * 1024 * 1024
+            num_qps_per_rank = self.num_sms // 2
+        else:
+            assert self.intranode
+            num_rdma_bytes = 0
+            num_qps_per_rank = 1
+
+        assert num_rdma_bytes is not None
+        assert num_qps_per_rank is not None
+        return dict(group=self.cpu_group,
+                    num_nvl_bytes=num_nvl_bytes,
+                    num_rdma_bytes=num_rdma_bytes,
+                    low_latency_mode=False,
+                    num_qps_per_rank=num_qps_per_rank)
+
+    def get_handle(self, kwargs):
+
+        assert len(kwargs) == 0, (
+            "DeepEPHTAll2AllManager expects no arguments. All the required "
+            "args are computed in the Manager itself.")
+
+        import deep_ep
+        buffer_kwargs = self._make_all2all_kwargs()
+        logger.debug("DeepEP all2all args %s", buffer_kwargs)
+        handle: deep_ep.Buffer = self.handle_cache.get_or_create(
+            buffer_kwargs, deep_ep.Buffer)
+        # It is dangerous to set num sms outside this function. num_sms is not
+        # a part of the hash-key that identifies this object. If we are in a
+        # situation where we make objects with different num_sms, the hash key
+        # in get_or_create must be updated.
+        handle.set_num_sms(self.num_sms)
+        return handle
+
+
+class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
+    """
+    All2All communication based on DeepEP Low-Latency kernels.
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def _make_all2all_kwargs(
+        self,
+        max_num_tokens_per_dp_rank: int,
+        token_hidden_size: int,
+        num_ep_ranks: int,
+        num_global_experts: int,
+        num_local_experts: int,
+    ) -> dict[Any, Any]:
+        """
+        max_num_tokens_per_dp_rank : the maximum number of tokens a DP rank
+          can dispatch all the ranks must hold the same value.
+        token_hidden_size: the hidden dimension of each token.
+        num_ep_ranks: the number of EP group ranks.
+        num_global_experts: Number of experts in the model.
+        num_local_experts: Number of experts in an EP rank.
+        """
+        import deep_ep
+
+        # Defaults for internode and intranode are taken from DeepEP tests.
+        num_nvl_bytes = 1024 * 1024 * 1024
+        num_qps_per_rank = num_local_experts
+        num_rdma_bytes = None
+
+        if self.internode:
+            num_rdma_bytes = 1024 * 1024 * 1024
+        else:
+            assert self.intranode
+            num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+                num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
+                hidden=token_hidden_size,
+                num_ranks=num_ep_ranks,
+                num_experts=num_global_experts)
+
+        assert num_rdma_bytes is not None
+        return dict(group=self.cpu_group,
+                    num_nvl_bytes=num_nvl_bytes,
+                    num_rdma_bytes=num_rdma_bytes,
+                    low_latency_mode=True,
+                    num_qps_per_rank=num_qps_per_rank)
+
+    def get_handle(self, kwargs):
+        """
+        The kwargs for DeepEPLLAll2AllManager is dictated by
+        _make_all2all_kwargs.
+        """
+        import deep_ep
+        buffer_kwargs = self._make_all2all_kwargs(**kwargs)
+        logger.debug("DeepEP all2all args %s", buffer_kwargs)
+        handle: deep_ep.Buffer = self.handle_cache.get_or_create(
+            buffer_kwargs, deep_ep.Buffer)
+        # It is dangerous to set num sms outside this function. num_sms is not
+        # a part of the hash-key that identifies this object. If we are in a
+        # situation where we make objects with different num_sms, the hash key
+        # in get_or_create must be updated.
+        handle.set_num_sms(self.num_sms)
+        return handle
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 0eebdf873..055d91690 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -67,6 +67,14 @@ class CudaCommunicator(DeviceCommunicatorBase):
                 from .all2all import PPLXAll2AllManager
                 self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
                 logger.info("Using PPLX all2all manager.")
+            elif all2all_backend == "deepep_high_throughput":
+                from .all2all import DeepEPHTAll2AllManager
+                self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
+                logger.info("Using DeepEP High-Throughput all2all manager.")
+            elif all2all_backend == "deepep_low_latency":
+                from .all2all import DeepEPLLAll2AllManager
+                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
+                logger.info("Using DeepEP Low-Latency all2all manager.")
             else:
                 raise ValueError(f"Unknown all2all backend: {all2all_backend}")
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 2e3d6eeb5..08bf2dad4 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -826,6 +826,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Available options:
     # - "naive": naive all2all implementation using all-reduce
     # - "pplx": use pplx kernels
+    # - "deepep_high_throughput", use deepep high-throughput kernels
+    # - "deepep_low_latency", use deepep low-latency kernels
     "VLLM_ALL2ALL_BACKEND":
     lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
 
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 331544d64..97b4a49c0 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -12,8 +12,8 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
     _moe_permute)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
     MoEPrepareAndFinalizeNoEP)
-from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize,
-                                                        _resize_cache)
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache, per_token_group_quant_fp8)
 from vllm.utils import round_up
 
 logger = init_logger(__name__)
@@ -34,10 +34,8 @@ def _valid_deep_gemm_shape(M: int, N: int, K: int):
     return align <= M and N % align == 0 and K % align == 0
 
 
-def _valid_deep_gemm(hidden_states: torch.Tensor,
-                     w1: torch.Tensor,
-                     w2: torch.Tensor,
-                     expert_map: Optional[torch.Tensor] = None) -> bool:
+def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
+                     w2: torch.Tensor) -> bool:
     """
     Check if the given problem size is supported by the DeepGemm grouped
     gemm kernel.  All of M, N, K and the quantization block_shape must be
@@ -47,10 +45,6 @@ def _valid_deep_gemm(hidden_states: torch.Tensor,
         logger.debug("DeepGemm disabled: deep_gemm not available.")
         return False
 
-    if expert_map is not None:
-        logger.debug("DeepGemm disabled: expert map NYI.")
-        return False
-
     M = hidden_states.size(0)
     _, K, N = w2.size()
     if not _valid_deep_gemm_shape(M, N, K):
@@ -116,7 +110,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         a1q = hidden_states
         _, N, K = w1.size()
 
-        assert global_num_experts != -1
+        if global_num_experts == -1:
+            global_num_experts = w1.size(0)
+
         assert w2.size(1) == K
 
         a1q, a1q_scale, _, expert_ids, inv_perm = _moe_permute(
@@ -128,6 +124,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             self.block_shape[0],
         )
 
+        if expert_map is not None:
+            # DeepGemm (Grouped Contiguous) kernel needs a valid B index
+            # for all rows of A. To that effect, simply compute with
+            # the 0th weight matrix.
+            # Note that this relies on the fact that corresponding topk
+            # weights would be 0 during weight multiplication.
+            expert_ids = torch.where(expert_ids == -1, 0, expert_ids)
+
         # Note: M_sum is different than the pre-permuted shape of a1q.
         M_sum = a1q.size(0)
         workspace1 = _resize_cache(workspace13, (M_sum, N))
@@ -140,9 +144,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         self.activation(activation, workspace2, workspace1.view(-1, N))
 
         a2q_scale: Optional[torch.Tensor] = None
-
-        a2q, a2q_scale = _fp8_quantize(workspace2, a2_scale, False,
-                                       self.block_shape)
+        a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
+                                                   self.block_shape[1],
+                                                   column_major_scales=True)
 
         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
             (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids)
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
new file mode 100644
index 000000000..48cf01638
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -0,0 +1,236 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+
+import deep_ep
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
+
+
+class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
+    """
+    Prepare/Finalize using DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self,
+                 buffer: deep_ep.Buffer,
+                 world_size: int,
+                 rank: int,
+                 dp_size: int,
+                 rank_expert_offset: int,
+                 quant_dtype: Optional[torch.dtype] = None,
+                 block_shape: Optional[list[int]] = None):
+        super().__init__()
+        self.buffer = buffer
+        self.world_size = world_size
+        self.rank = rank
+        self.dp_size = dp_size
+        self.rank_expert_offset = rank_expert_offset
+        self.quant_dtype = quant_dtype
+        self.block_shape = block_shape
+        # The dispatch function returns a handle that the combine function
+        # requires. We store the handle here so it is available to the
+        # combine function.
+        self.handle = None
+
+        # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164
+        self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160]
+
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return None
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return torch.int64
+
+    def _get_dispatch_config(self) -> Optional[deep_ep.Config]:
+        if self.dp_size not in self.available_rank_configs:
+            return None
+        return deep_ep.Buffer.get_dispatch_config(self.dp_size)
+
+    def _get_combine_config(self) -> Optional[deep_ep.Config]:
+        if self.dp_size not in self.available_rank_configs:
+            return None
+        return deep_ep.Buffer.get_combine_config(self.dp_size)
+
+    def _do_quant(self, tokens: torch.Tensor,
+                  token_scales: Optional[torch.Tensor], per_act_token: bool):
+        tokens, token_scales = moe_kernel_quantize_input(
+            tokens, token_scales, self.quant_dtype, per_act_token,
+            self.block_shape)
+        return tokens, token_scales
+
+    def _do_dispatch(self, tokens: torch.Tensor,
+                     token_scales: Optional[torch.Tensor],
+                     rank_topk_ids: torch.Tensor,
+                     rank_topk_weights: torch.Tensor, num_experts: int):
+
+        has_scales = token_scales is not None
+
+        (num_tokens_per_rank, num_tokens_per_rdma_rank, expert_num_tokens,
+         is_token_in_rank, event) = self.buffer.get_dispatch_layout(
+             topk_idx=rank_topk_ids,
+             num_experts=num_experts,
+             previous_event=None,
+             async_finish=False,
+             allocate_on_comm_stream=False)
+
+        token_data = tokens
+        if has_scales:
+            token_data = (tokens, token_scales)
+
+        (
+            token_data, expert_topk_ids, expert_topk_weights,
+            expert_num_tokens_per_expert_list, self.handle, event
+        ) = self.buffer.dispatch(
+            x=token_data,
+            handle=None,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=expert_num_tokens,
+            topk_idx=rank_topk_ids,
+            topk_weights=rank_topk_weights,
+            # expert_alignment rounds the number of tokens per expert
+            # to this value.
+            expert_alignment=1,
+            config=self._get_dispatch_config(),
+            previous_event=None,
+            async_finish=False,
+            allocate_on_comm_stream=False)
+
+        if has_scales:
+            expert_x, expert_x_scale = token_data
+        else:
+            expert_x, expert_x_scale = token_data, None
+
+        # The existing MOE kernels assume that all entries of topk_ids are
+        # valid. To that effect, set the -1s in expert_topk_ids to some expert
+        # outside this rank so the expert_map can remap it to -1 when safe.
+        # With Expert Parallel, the experts are divided amongst the rank
+        # sequentially. For rank 0, set it to num_experts - 1 and for all other
+        # ranks set it to 0 as we know that expert_map will have a -1 in those
+        # regions for those ranks.
+        #
+        # DeepEP's topk_ids output refers to the local experts directly. Offset
+        # the topk_ids to move it back to the global experts space so it aligns
+        # with existing vLLM interfaces.
+        expert_topk_ids = torch.where(
+            expert_topk_ids == -1,
+            num_experts - 1 if self.rank_expert_offset == 0 else 0,
+            expert_topk_ids + self.rank_expert_offset)
+
+        return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
+                expert_topk_weights)
+
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        rank_topk_weights: torch.Tensor,
+        rank_topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+        if apply_router_weight_on_input:
+            topk = rank_topk_ids.size(1)
+            # TODO: this only works for topK=1, will need to update for topK>1
+            assert topk == 1, (
+                "apply_router_weight_on_input is only implemented for topk=1")
+            a1 = a1 * rank_topk_weights.to(a1.dtype)
+
+        # Check if there is a block_shape / or if we can infer the quantization
+        # schemes from the scales.
+        per_token_quant = None
+        if all([x is None for x in [self.block_shape, a1_scale, a2_scale]
+                ]) and self.quant_dtype is not None:
+            # Quantization required despite none of the inputs suggesting
+            # quantization. Fallback to per_token_dynamic quant.
+            per_token_quant = True
+        else:
+            per_token_quant = ((self.block_shape is not None) or
+                               (a1_scale is not None and a1_scale.numel() != 1)
+                               or (a2_scale is not None
+                                   and a2_scale.numel() != 1))
+
+        if per_token_quant:
+            a1q, a1q_scale = self._do_quant(a1, a1_scale, per_act_token=True)
+            (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
+             expert_topk_weights) = self._do_dispatch(
+                 tokens=a1q,
+                 token_scales=a1q_scale,
+                 rank_topk_ids=rank_topk_ids,
+                 rank_topk_weights=rank_topk_weights,
+                 num_experts=num_experts)
+        else:
+            # DeepEP kernels only support dispatching per-token-quant
+            # quantization. dispatch in bfloat16.
+            (expert_x, _, expert_num_tokens, expert_topk_ids,
+             expert_topk_weights) = self._do_dispatch(
+                 tokens=a1,
+                 token_scales=None,
+                 rank_topk_ids=rank_topk_ids,
+                 rank_topk_weights=rank_topk_weights,
+                 num_experts=num_experts)
+            # quantize now
+            expert_x_scale = None
+            if expert_x.numel() != 0:
+                expert_x, expert_x_scale = self._do_quant(expert_x,
+                                                          a1_scale,
+                                                          per_act_token=False)
+
+        return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
+                expert_topk_weights)
+
+    def _apply_weights_and_reduce(self, num_tokens: int,
+                                  fused_expert_output: torch.Tensor,
+                                  topk_weights: torch.Tensor,
+                                  apply_router_weight_on_input: bool,
+                                  output_dtype: torch.dtype):
+
+        if fused_expert_output.ndim == 2:
+            hidden_dim = fused_expert_output.size(-1)
+            fused_expert_output = fused_expert_output.view(
+                num_tokens, -1, hidden_dim)
+
+        if not apply_router_weight_on_input:
+            # The DeepEP combine kernels don't do the topk weight
+            # multiplication. We multiply the weights locally.
+            fused_expert_output = fused_expert_output.to(torch.float32)
+            fused_expert_output = fused_expert_output * topk_weights.view(
+                fused_expert_output.size(0), -1, 1)
+            fused_expert_output = fused_expert_output.to(output_dtype)
+
+        return fused_expert_output.sum(dim=1).to(output_dtype)
+
+    def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
+                 topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                 apply_router_weight_on_input: bool) -> None:
+
+        assert self.handle is not None
+
+        # fused_expert_output can have 0 tokens - This happens when none of the
+        # tokens from the all2all reach this EP rank.
+        if fused_expert_output.numel() != 0:
+            fused_expert_output = self._apply_weights_and_reduce(
+                num_tokens=topk_ids.size(0),
+                fused_expert_output=fused_expert_output,
+                topk_weights=topk_weights,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                output_dtype=output.dtype)
+
+        combined_x, _, event = self.buffer.combine(
+            x=fused_expert_output,
+            handle=self.handle,
+            topk_weights=None,
+            config=self._get_combine_config(),
+            previous_event=None,
+            async_finish=False,
+            allocate_on_comm_stream=False)
+        # Respect inplace outputs.
+        output.copy_(combined_x, non_blocking=True)
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
new file mode 100644
index 000000000..b9d817a14
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+
+import deep_ep
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
+
+# DeepEP kernels quantize dispatch inputs in 128 element chunks.
+DEEPEP_QUANT_BLOCK_SIZE = 128
+
+
+def dequant_fp8(expert_x_fp8: torch.Tensor,
+                expert_x_scales: torch.Tensor) -> torch.Tensor:
+    """
+    Return dequantized tensor in fp32
+    """
+    # TODO (varun) : Optimize leverage num_tokens_per_expert counts
+    assert expert_x_fp8.is_contiguous()
+    expert_x_scales = expert_x_scales.contiguous()
+    num_experts = expert_x_fp8.size(0)
+
+    expert_x_fp32 = expert_x_fp8.to(torch.float32).view(
+        num_experts, -1, DEEPEP_QUANT_BLOCK_SIZE)
+    expert_x_scales = expert_x_scales.view(num_experts, -1, 1)
+    return (expert_x_fp32 * expert_x_scales).view(expert_x_fp8.shape)
+
+
+class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
+    """
+    Prepare/Finalize using DeepEP low-latency kernels.
+    """
+
+    # DeepEP low-latency kernels are compiled only for certain
+    # specific hidden sizes.
+    SUPPORTED_HIDDEN_SIZES = [2560, 4096, 5120, 7168]
+
+    def __init__(self,
+                 buffer: deep_ep.Buffer,
+                 world_size: int,
+                 dp_size: int,
+                 max_tokens_per_rank: int,
+                 quant_dtype: Optional[torch.dtype] = None,
+                 block_shape: Optional[list[int]] = None,
+                 use_fp8_dispatch: bool = False):
+        super().__init__()
+
+        self.buffer = buffer
+        self.world_size = world_size
+        self.dp_size = dp_size
+        self.quant_dtype = quant_dtype
+        self.block_shape = block_shape
+        self.max_tokens_per_rank = max_tokens_per_rank
+        self.use_fp8_dispatch = use_fp8_dispatch
+        # The dispatch function returns a handle that the combine function
+        # requires. We store the handle here so it is available to the
+        # combine function.
+        self.handle = None
+
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return self.max_tokens_per_rank
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return torch.int64
+
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        rank_topk_weights: torch.Tensor,
+        rank_topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+        hidden_size = a1.size(1)
+        assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \
+            (f"Hidden Size {hidden_size} not in supported list of hidden sizes"
+            f"{self.SUPPORTED_HIDDEN_SIZES}")
+
+        if self.use_fp8_dispatch:
+            assert hidden_size % 128 == 0, \
+            "DeepEP kernels quantize the inputs in blocks of shape 128"
+
+        # Quantize
+        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
+            a2_scale.numel() != 1 if a2_scale is not None else False)
+        assert not per_act_token, (
+            "low_latency kernels don't support per-act-token quant")
+
+        if apply_router_weight_on_input:
+            topk = rank_topk_ids.size(1)
+            # TODO: this only works for topK=1, will need to update for topK>1
+            assert topk == 1, (
+                "apply_router_weight_on_input is only implemented for topk=1")
+            a1 = a1 * rank_topk_weights.to(a1.dtype)
+
+        # Dispatch
+        expert_x, expert_num_tokens, self.handle, event, hook = \
+                self.buffer.low_latency_dispatch(a1,
+                                                rank_topk_ids,
+                                                self.max_tokens_per_rank,
+                                                num_experts,
+                                                use_fp8=self.use_fp8_dispatch,
+                                                async_finish=False,
+                                                return_recv_hook=False)
+
+        if self.use_fp8_dispatch:
+            # TODO (varun) : In the case of dynamic quantization, we could
+            # probably skip the quant below and use the results directly.
+            # Although note that the deepep quant is per token 128 elements.
+            expert_x_fp8, expert_x_scales = expert_x
+            expert_x = dequant_fp8(expert_x_fp8,
+                                   expert_x_scales).to(dtype=a1.dtype)
+
+        num_experts = expert_x.size(0)
+        hidden_dim = expert_x.size(-1)
+
+        expert_x = expert_x.view((-1, expert_x.size(-1)))
+        expert_x, expert_x_scale = moe_kernel_quantize_input(
+            expert_x, a1_scale, self.quant_dtype, per_act_token,
+            self.block_shape)
+        expert_x = expert_x.view((num_experts, -1, hidden_dim))
+
+        return (expert_x, expert_x_scale, expert_num_tokens, None, None)
+
+    def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
+                 topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                 apply_router_weight_on_input: bool) -> None:
+
+        assert self.handle is not None
+
+        combine_topk_weights = topk_weights
+        if apply_router_weight_on_input:
+            # weights have already been applied.
+            combine_topk_weights = torch.ones_like(topk_weights)
+
+        # TODO (varun) : Enable zero copy mode
+        _, event, hook = self.buffer.low_latency_combine(
+            fused_expert_output,
+            topk_ids,
+            combine_topk_weights,
+            self.handle,
+            async_finish=False,
+            zero_copy=False,
+            return_recv_hook=False,
+            out=output)
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 205a95e7f..7490a192d 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -10,7 +10,8 @@ import triton.language as tl
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     get_config_dtype_str, try_get_optimal_moe_config)
-from vllm.model_executor.layers.fused_moe.utils import _resize_cache
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache, moe_kernel_quantize_input)
 
 
 @triton.jit
@@ -397,6 +398,12 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self.rank = rank
         self.max_num_tokens = max_num_tokens
 
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return self.max_num_tokens
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return None
+
     def prepare(
         self,
         a1: torch.Tensor,
@@ -407,7 +414,8 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
         assert a1.dim() == 2
         assert topk_ids.dim() == 2
         assert topk_ids.size(0) == a1.size(0)
@@ -450,7 +458,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                  first_expert, :rows, :] = a1[:topks.numel()][topks]
             tokens_per_expert[expert_id - first_expert] = rows
 
-        return b_a1, a1_scale, tokens_per_expert
+        return b_a1, a1_scale, tokens_per_expert, None, None
 
     def finalize(
         self,
@@ -601,6 +609,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         use_int8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
+        per_channel_quant: bool = False,
         block_shape: Optional[list[int]] = None,
         world_size: int = 1,
         dp_size: int = 1,
@@ -611,12 +620,15 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         self.use_int4_w4a16 = use_int4_w4a16
         self.use_int8_w8a16 = use_int8_w8a16
         self.block_shape = block_shape
+        self.per_channel_quant = per_channel_quant
         self.max_num_tokens = max_num_tokens
-        assert not use_int8_w8a8, "NYI"
-        assert not use_int4_w4a16, "NYI"
         self.world_size = world_size
         self.dp_size = dp_size
 
+        assert not use_int8_w8a8, "NYI"
+        assert not use_int4_w4a16, "NYI"
+        assert self.block_shape is None, "NYI"
+
     def workspace_shapes(
         self,
         a: torch.Tensor,
@@ -670,8 +682,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn
         ]
 
-        # TODO: num_tokens -> max_num_tokens?
-        E, num_tokens, N, K, top_k_num = mk._moe_problem_size(
+        E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
             hidden_states, w1, w2, topk_ids)
 
         assert w1.size(0) == E
@@ -687,7 +698,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             w2.size(),
             top_k_num,
             config_dtype,
-            num_tokens,
+            max_num_tokens,
             block_shape=self.block_shape,
         )
 
@@ -706,10 +717,12 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         #print(f"shape: E={E}, M={num_tokens}, N={N}, K={K}, top_k={top_k_num}")
         # We can reuse the memory between these because by the time we need
         # cache3, we're done with cache1
-        intermediate_cache1 = _resize_cache(workspace13, (E, num_tokens, N))
+        intermediate_cache1 = _resize_cache(workspace13,
+                                            (E, max_num_tokens, N))
         intermediate_cache2 = _resize_cache(workspace2,
-                                            (E, num_tokens, N // 2))
-        intermediate_cache3 = _resize_cache(workspace13, (E, num_tokens, K))
+                                            (E, max_num_tokens, N // 2))
+        intermediate_cache3 = _resize_cache(workspace13,
+                                            (E, max_num_tokens, K))
 
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
@@ -731,15 +744,20 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         self.activation(activation, intermediate_cache2.view(-1, N // 2),
                         intermediate_cache1.view(-1, N))
 
-        #qintermediate_cache2 = intermediate_cache2
-        a2q_scale = a2_scale
-        # TODO (varun) : support w8a8
-        assert not self.use_fp8_w8a8
-        #if self.use_fp8_w8a8:
-        #    qintermediate_cache2, a2q_scale = _fp8_quantize(
-        #        intermediate_cache2, a2_scale, self.block_shape)
+        ic2_hidden_size = intermediate_cache2.size(-1)
+        intermediate_cache2 = intermediate_cache2.view(-1, ic2_hidden_size)
+
+        qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
+            A=intermediate_cache2,
+            A_scale=a2_scale,
+            qtype=torch.float8_e4m3fn if self.use_fp8_w8a8 else None,
+            per_channel_quant=self.per_channel_quant,
+            block_shape=self.block_shape)
 
-        invoke_moe_batched_triton_kernel(A=intermediate_cache2,
+        qintermediate_cache2 = qintermediate_cache2.view(
+            (E, -1, ic2_hidden_size))
+
+        invoke_moe_batched_triton_kernel(A=qintermediate_cache2,
                                          B=w2,
                                          C=intermediate_cache3,
                                          expert_num_tokens=expert_num_tokens,
@@ -752,5 +770,4 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
                                          use_int4_w4a16=self.use_int4_w4a16,
                                          config=config,
                                          block_shape=self.block_shape)
-
         return intermediate_cache3
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 883a48c98..de7a9a8d0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1164,7 +1164,7 @@ def fused_experts(hidden_states: torch.Tensor,
     # permute/unpermute ops are available.
     N = w1.shape[1]
     if (allow_deep_gemm and use_fp8_w8a8 and N > 512
-            and _valid_deep_gemm(hidden_states, w1, w2, expert_map)):
+            and _valid_deep_gemm(hidden_states, w1, w2)):
         assert apply_router_weight_on_input is False
         return deep_gemm_moe_fp8(
             hidden_states=hidden_states,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 3ce4cbc28..1812f3b67 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -5,7 +5,7 @@ import importlib
 from abc import abstractmethod
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -30,16 +30,19 @@ from vllm.platforms.interface import CpuArchEnum
 from vllm.utils import direct_register_custom_op
 
 has_pplx = importlib.util.find_spec("pplx_kernels") is not None
+has_deepep = importlib.util.find_spec("deep_ep") is not None
 
 if current_platform.is_cuda_alike():
-    from .fused_batched_moe import (BatchedPrepareAndFinalize,
-                                    BatchedTritonExperts)
+    from .fused_batched_moe import BatchedTritonExperts
     from .fused_moe import TritonExperts, fused_experts
     from .modular_kernel import (FusedMoEModularKernel,
                                  FusedMoEPermuteExpertsUnpermute,
                                  FusedMoEPrepareAndFinalize)
     if has_pplx:
         from .pplx_prepare_finalize import PplxPrepareAndFinalize
+    if has_deepep:
+        from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
+        from .deepep_ll_prepare_finalize import DeepEPLLPrepareAndFinalize
 else:
     fused_experts = None  # type: ignore
     FusedMoEPermuteExpertsUnpermute = None  # type: ignore
@@ -71,10 +74,24 @@ class FusedMoEParallelConfig:
 
     use_ep: bool  # whether to use EP or not
 
+    @property
+    def use_all2all_kernels(self):
+        return self.dp_size > 1 and self.use_ep
+
     @property
     def use_pplx_kernels(self):
-        return self.dp_size > 1 and self.use_ep and \
-             envs.VLLM_ALL2ALL_BACKEND == "pplx"
+        return (self.use_all2all_kernels
+                and envs.VLLM_ALL2ALL_BACKEND == "pplx")
+
+    @property
+    def use_deepep_ht_kernels(self):
+        return (self.use_all2all_kernels
+                and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput")
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return (self.use_all2all_kernels
+                and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
 
     @staticmethod
     def make(tp_size_: int, dp_size_: int,
@@ -231,6 +248,14 @@ class MoEConfig:
     def use_pplx_kernels(self):
         return self.moe_parallel_config.use_pplx_kernels
 
+    @property
+    def use_deepep_ht_kernels(self):
+        return self.moe_parallel_config.use_deepep_ht_kernels
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return self.moe_parallel_config.use_deepep_ll_kernels
+
 
 class FusedMoeWeightScaleSupported(Enum):
     TENSOR = "tensor"
@@ -252,7 +277,16 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         all2all_manager = get_ep_group().device_communicator.all2all_manager
         assert all2all_manager is not None
 
-        prepare_finalize = None
+        quant_dtype = None
+        act_quant_block_size = None
+        from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+        if isinstance(quant_config, Fp8Config):
+            act_quant_block_size = quant_config.weight_block_size
+            quant_dtype = torch.float8_e4m3fn
+
+        prepare_finalize: Optional[Union[PplxPrepareAndFinalize,
+                                         DeepEPHTPrepareAndFinalize,
+                                         DeepEPLLPrepareAndFinalize]] = None
         if moe.use_pplx_kernels:
             all_to_all_args = dict(
                 max_num_tokens=moe.max_num_tokens,
@@ -288,8 +322,49 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                 dp_size=all2all_manager.tp_group.world_size,
                 quant_dtype=moe.in_dtype,
             )
+        elif moe.use_deepep_ht_kernels:
+            assert moe.dp_size == all2all_manager.dp_world_size
+
+            all_to_all_args = dict()
+            handle = all2all_manager.get_handle(all_to_all_args)
+            prepare_finalize = DeepEPHTPrepareAndFinalize(
+                handle,
+                world_size=all2all_manager.world_size,
+                rank=all2all_manager.rank,
+                dp_size=all2all_manager.dp_world_size,
+                rank_expert_offset=all2all_manager.rank *
+                moe.num_local_experts,
+                quant_dtype=quant_dtype,
+                block_shape=act_quant_block_size,
+            )
+
+        elif moe.use_deepep_ll_kernels:
+            assert moe.dp_size == all2all_manager.dp_world_size
 
+            all_to_all_args = dict(
+                max_num_tokens_per_dp_rank=moe.max_num_tokens,
+                token_hidden_size=moe.hidden_dim,
+                num_ep_ranks=all2all_manager.world_size,
+                num_global_experts=moe.num_experts,
+                num_local_experts=moe.num_experts //
+                all2all_manager.world_size)
+            handle = all2all_manager.get_handle(all_to_all_args)
+
+            # Note (varun): Whether to use FP8 dispatch or not needs some
+            # profiling. Turning it off for now.
+            prepare_finalize = DeepEPLLPrepareAndFinalize(
+                handle,
+                world_size=all2all_manager.world_size,
+                dp_size=all2all_manager.dp_world_size,
+                max_tokens_per_rank=moe.max_num_tokens,
+                quant_dtype=quant_dtype,
+                block_shape=act_quant_block_size,
+                use_fp8_dispatch=False,
+            )
+
+        self.topk_indices_dtype = None
         if prepare_finalize is not None:
+            self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
             experts = self.select_gemm_impl(prepare_finalize)
             self.fused_experts = FusedMoEModularKernel(
                 prepare_finalize,
@@ -297,7 +372,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
             )
 
     def select_gemm_impl(
-        self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]
+        self, prepare_finalize: FusedMoEPrepareAndFinalize
     ) -> FusedMoEPermuteExpertsUnpermute:
         # based on the all2all implementation, select the appropriate
         # gemm implementation
@@ -334,6 +409,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     def __init__(self, moe: MoEConfig):
         super().__init__()
         self.fused_experts = fused_experts  # type: ignore
+        self.topk_indices_dtype = None
         self.moe = moe
 
         self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
@@ -343,8 +419,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         else:
             self.rocm_aiter_fused_experts = None  # type: ignore
 
-    def select_gemm_impl(
-            self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]):
+    def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize):
 
         assert self.fused_experts == fused_experts
 
@@ -353,11 +428,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
 
         experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
 
-        if isinstance(prepare_finalize,
-                      (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
+        use_batched_experts = prepare_finalize.max_num_tokens_per_rank(
+        ) is not None
+        if use_batched_experts:
             logger.debug("BatchedTritonExperts %s", self.moe)
+            assert self.moe.dp_size == all2all_manager.dp_world_size
             experts = BatchedTritonExperts(
-                max_num_tokens=MOE_DP_CHUNK_SIZE,
+                max_num_tokens=self.moe.max_num_tokens,
                 world_size=all2all_manager.world_size,
                 # dp_size actually means tp_size, bug in pplx kernels
                 dp_size=all2all_manager.tp_group.world_size,
@@ -366,6 +443,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 use_int8_w8a16=False,
                 use_int4_w4a16=False,
                 block_shape=None,
+                per_channel_quant=False,
             )
         else:
             logger.debug("TritonExperts %s", self.moe)
@@ -494,6 +572,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -505,7 +584,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=torch.uint32 if self.moe.use_pplx_kernels else None)
+            indices_type=self.topk_indices_dtype)
 
         if self.rocm_aiter_moe_enabled:
             assert expert_map is None
@@ -806,11 +885,8 @@ class FusedMoE(torch.nn.Module):
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
         quant_method: Optional[QuantizeMethodBase] = None
-
-        if quant_config is None:
-            quant_method = UnquantizedFusedMoEMethod(moe)
-        else:
-            quant_method = quant_config.get_quant_method(self, prefix)
+        quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None
+                        else quant_config.get_quant_method(self, prefix))
 
         assert quant_method is not None
         assert isinstance(quant_method, FusedMoEMethodBase)
@@ -836,7 +912,8 @@ class FusedMoE(torch.nn.Module):
         # Chunked all2all staging tensor
         self.batched_hidden_states: Optional[torch.Tensor] = None
         self.batched_router_logits: Optional[torch.Tensor] = None
-        if self.moe_parallel_config.use_pplx_kernels:
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
             act_dtype = vllm_config.model_config.dtype
             self.batched_hidden_states = torch.zeros(
                 (MOE_DP_CHUNK_SIZE, self.hidden_size),
@@ -880,6 +957,14 @@ class FusedMoE(torch.nn.Module):
     def use_pplx_kernels(self):
         return self.moe_parallel_config.use_pplx_kernels
 
+    @property
+    def use_deepep_ht_kernels(self):
+        return self.moe_parallel_config.use_deepep_ht_kernels
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return self.moe_parallel_config.use_deepep_ll_kernels
+
     def _load_per_tensor_weight_scale(self, shard_id: str,
                                       param: torch.nn.Parameter,
                                       loaded_weight: torch.Tensor,
@@ -1210,19 +1295,21 @@ class FusedMoE(torch.nn.Module):
         When just tensor-parallel is used, it is not required to reduce
         the shared_experts results immediately. Instead we reduce at the
         once at the end of the MoE op. (Refer to DeepSeekV2MoE module)
-        With EP and the pplx kernels - this is no longer viable as all
+        With EP and all2all kernels - this is no longer viable as all
         GPU ranks in DP, produce the complete set of hidden_states.
         Therefore it is required that we reduce the shared_experts output
         early.
         """
-        return self.use_pplx_kernels
+        return (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels)
 
     def maybe_all_reduce_tensor_model_parallel(
             self, final_hidden_states: torch.Tensor):
         """
         The pplx combine kernel reduces across GPU ranks by default.
         """
-        if self.use_pplx_kernels:
+        if (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels):
             return final_hidden_states
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
@@ -1289,7 +1376,7 @@ class FusedMoE(torch.nn.Module):
 
         ctx = get_forward_context()
         max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
-        moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE
+        moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens
 
         num_tokens = full_hidden_states.size(0)
         for chunk_start_ in range(0, max_tokens_across_dp,
@@ -1310,12 +1397,17 @@ class FusedMoE(torch.nn.Module):
     def forward_impl(self, hidden_states: torch.Tensor,
                      router_logits: torch.Tensor):
         assert self.quant_method is not None
-        if self.moe_parallel_config.use_pplx_kernels:
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
             return self.forward_impl_chunked(hidden_states, router_logits)
 
-        if self.dp_size > 1:
+        do_naive_dispatch_combine: bool = (
+            self.dp_size > 1
+            and not self.moe_parallel_config.use_deepep_ht_kernels)
+        if do_naive_dispatch_combine:
             hidden_states, router_logits = get_ep_group().dispatch(
                 hidden_states, router_logits)
+
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
@@ -1335,12 +1427,12 @@ class FusedMoE(torch.nn.Module):
             apply_router_weight_on_input=self.apply_router_weight_on_input,
         )
 
-        if self.dp_size > 1:
+        if do_naive_dispatch_combine:
             final_hidden_states = get_ep_group().combine(final_hidden_states)
 
         if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
-            # Default set to False. (May have to add shared expert outputs.)
-            final_hidden_states = tensor_model_parallel_all_reduce(
+            # Default set to False. (May have to add shared expert outputs.
+            final_hidden_states = self.maybe_all_reduce_tensor_model_parallel(
                 final_hidden_states)
 
         return final_hidden_states
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 5e321c9b4..2c27d31eb 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -94,7 +94,8 @@ class FusedMoEPrepareAndFinalize(ABC):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
         """
         Perform any quantization (and/or) dispatching needed
         for this kernel.
@@ -113,6 +114,10 @@ class FusedMoEPrepareAndFinalize(ABC):
         Returns a tuple of:
         - quantized + dispatched a.
         - quantized + dispatched a1_scales.
+        - Optional tensor as big as number of local experts that contains the
+          number of tokens assigned to each local expert. 
+        - Optional dispatched expert topk IDs
+        - Optional dispatched expert topk weight 
         """
         raise NotImplementedError
 
@@ -138,6 +143,27 @@ class FusedMoEPrepareAndFinalize(ABC):
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        """
+        The PrepareFinalize All2All implementations generally constrain the
+        dtype of the topk_ids they support. This function returns the
+        required topk indices dtype so it can be respected.
+        Return None if there are no such restrictions.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        """
+        Some PrepareFinalize All2All implementations are batched. Meaning,
+        they can processes only as set of tokens at a time. This
+        function returns the batch size i.e the maximum number of tokens
+        the implementation can process at a time. 
+        Return None if there are no such restrictions.
+        """
+        raise NotImplementedError
+
 
 class FusedMoEPermuteExpertsUnpermute(ABC):
     """
@@ -261,6 +287,61 @@ class FusedMoEModularKernel(torch.nn.Module):
         self.prepare_finalize = prepare_finalize
         self.fused_experts = fused_experts
 
+    def _do_fused_experts(
+            self,
+            a1: torch.Tensor,  # input to forward fn
+            a1q: torch.Tensor,  # output of prepare fn
+            w1: torch.Tensor,
+            w2: torch.Tensor,
+            topk_ids: torch.Tensor,
+            expert_num_tokens: torch.Tensor,
+            activation: str,
+            global_num_experts: int,
+            expert_map: Optional[torch.Tensor],
+            w1_scale: Optional[torch.Tensor],
+            w2_scale: Optional[torch.Tensor],
+            w1_zp: Optional[torch.Tensor],
+            w2_zp: Optional[torch.Tensor],
+            a1q_scale: Optional[torch.Tensor],
+            a2_scale: Optional[torch.Tensor]) -> torch.Tensor:
+
+        _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)
+
+        # Use a1 here to decipher the correct workspace datatype
+        workspace13_shape, workspace2_shape, workspace_dtype = (
+            self.fused_experts.workspace_shapes(a1, M, N, K, top_k,
+                                                global_num_experts))
+
+        # We can reuse the memory between cache1 and cache3 because by the time
+        # we need cache3, we're done with cache1
+        workspace13 = torch.zeros(workspace13_shape,
+                                  device=a1.device,
+                                  dtype=workspace_dtype)
+        workspace2 = torch.zeros(workspace2_shape,
+                                 device=a1.device,
+                                 dtype=workspace_dtype)
+
+        fused_out = self.fused_experts.apply(
+            a1q,
+            w1,
+            w2,
+            topk_ids,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            w1_zp=w1_zp,
+            w2_zp=w2_zp,
+            a1q_scale=a1q_scale,
+            a2_scale=a2_scale,
+            workspace13=workspace13,
+            workspace2=workspace2,
+            expert_num_tokens=expert_num_tokens,
+        )
+
+        return fused_out
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -315,49 +396,48 @@ class FusedMoEModularKernel(torch.nn.Module):
         Returns:
         - torch.Tensor: The output tensor after applying the MoE layer.
         """
-        a1 = hidden_states
-        E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids)
-
-        if global_num_experts == -1:
-            global_num_experts = E
 
+        a1 = hidden_states
         output = a1 if inplace else torch.zeros_like(a1)
 
-        workspace13_shape, workspace2_shape, workspace_dtype = (
-            self.fused_experts.workspace_shapes(a1, M, N, K, top_k,
-                                                global_num_experts))
-
-        # We can reuse the memory between cache1 and cache3 because by the time
-        # we need cache3, we're done with cache1
-        workspace13 = torch.zeros(workspace13_shape,
-                                  device=a1.device,
-                                  dtype=workspace_dtype)
-        workspace2 = torch.zeros(workspace2_shape,
-                                 device=a1.device,
-                                 dtype=workspace_dtype)
-
-        a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare(
-            a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts,
-            expert_map, apply_router_weight_on_input)
-
-        fused_out = self.fused_experts.apply(
-            a1q,
-            w1,
-            w2,
-            topk_ids,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            w1_zp=w1_zp,
-            w2_zp=w2_zp,
-            a1q_scale=a1q_scale,
-            a2_scale=a2_scale,
-            workspace13=workspace13,
-            workspace2=workspace2,
-            expert_num_tokens=expert_num_tokens,
-        )
+        if global_num_experts == -1:
+            global_num_experts = w1.size(0)
+
+        (a1q, a1q_scale, expert_num_tokens, _expert_topk_ids,
+         _expert_topk_weights) = self.prepare_finalize.prepare(
+             a1, a1_scale, a2_scale, topk_weights, topk_ids,
+             global_num_experts, expert_map, apply_router_weight_on_input)
+        # Maybe prepare gathered topk_ids and topk_weights from other EP ranks.
+        topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids
+        topk_weights = (topk_weights if _expert_topk_weights is None else
+                        _expert_topk_weights)
+
+        fused_out = None
+        if a1q.numel() == 0:
+            # This happens when none of the tokens from the all2all reach this
+            # EP rank. Also, note that this is only relevant for CUDAGraph
+            # incompatible all2all kernels like the DeepEP high-throughput
+            # kernels. CUDAGraph compatible all2all kernels like the pplx
+            # kernels and the DeepEP low-latency kernels are always batched
+            # and can never run into the tensor.numel() == 0 case.
+            fused_out = torch.empty_like(a1q).to(dtype=a1.dtype)
+        else:
+            fused_out = self._do_fused_experts(
+                a1=a1,
+                a1q=a1q,
+                w1=w1,
+                w2=w2,
+                topk_ids=topk_ids,
+                expert_num_tokens=expert_num_tokens,
+                activation=activation,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                w1_zp=w1_zp,
+                w2_zp=w2_zp,
+                a1q_scale=a1q_scale,
+                a2_scale=a2_scale)
 
         self.prepare_finalize.finalize(output, fused_out, topk_weights,
                                        topk_ids, apply_router_weight_on_input)
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index da7871434..89481e5bd 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -25,7 +25,7 @@ def _moe_permute(
     """
     top_k_num = curr_topk_ids.size(1)
 
-    tokens_in_chunk = curr_hidden_states.sizze(0)
+    tokens_in_chunk = curr_hidden_states.size(0)
 
     sorted_token_ids, expert_ids, num_tokens_post_padded = (
         moe_align_block_size(curr_topk_ids,
@@ -37,11 +37,12 @@ def _moe_permute(
     inv_perm: Optional[torch.Tensor] = None
 
     num_tokens = top_k_num * tokens_in_chunk
-    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
     expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)
     inv_perm = torch.argsort(sorted_token_ids)[:num_tokens]
 
     # Permute according to sorted token ids.
+    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
+
     curr_hidden_states = _fp8_perm(curr_hidden_states,
                                    sorted_token_ids // top_k_num)
 
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 8405603cf..1170a16f3 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -32,6 +32,12 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self.dp_size = dp_size
         self.quant_dtype = quant_dtype
 
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return self.max_num_tokens
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return torch.uint32
+
     def prepare(
         self,
         a1: torch.Tensor,
@@ -42,7 +48,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
         num_tokens = a1.size(0)  # M
         hidden_dim = a1.size(-1)  # K
 
@@ -115,7 +122,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             bound_m=bound_m,
         )
 
-        return expert_x, expert_x_scale, expert_num_tokens
+        return expert_x, expert_x_scale, expert_num_tokens, None, None
 
     def finalize(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index 77a9686c9..9ed95e1de 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -24,6 +24,12 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
         self.block_shape = block_shape
         self.quant_dtype = quant_dtype
 
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return None
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return None
+
     def prepare(
         self,
         a1: torch.Tensor,
@@ -34,7 +40,9 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool = False,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
+
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
@@ -47,7 +55,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
                                                    self.per_channel_quant,
                                                    self.block_shape)
 
-        return a1q, a1q_scale, None
+        return a1q, a1q_scale, None, None, None
 
     def finalize(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index 373e8ab39..920931a93 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -29,9 +29,10 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
                                            per_channel_quant=per_channel_quant,
                                            block_shape=block_shape,
                                            block_m=block_m)
-        self.deep_gemm_expert = DeepGemmExperts()
         self.allow_deep_gemm = allow_deep_gemm
         self.use_fp8_w8a8 = use_fp8_w8a8
+        self.deep_gemm_expert = DeepGemmExperts(
+        ) if self.allow_deep_gemm else None
 
     def workspace_shapes(
         self,
@@ -46,6 +47,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K):
+            assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.workspace_shapes(
                 a, M, N, K, topk, num_experts)
         else:
@@ -73,7 +75,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     ) -> torch.Tensor:
         N = w1.size(1)
         if (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512
-                and _valid_deep_gemm(hidden_states, w1, w2, expert_map)):
+                and _valid_deep_gemm(hidden_states, w1, w2)):
+            assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.apply(
                 hidden_states,
                 w1,
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index c3a584782..692482c2e 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -18,8 +18,8 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
     Shrink the given tensor and apply the given view to it.  This is
     used to resize the intermediate fused_moe caches.
     """
-    assert prod(
-        v) <= x.numel(), f"{prod(v)} <= {x.numel()}"  # CUDAGRAPH unfriendly?
+    assert prod(v) <= x.numel(
+    ), f"{v} ({prod(v)}) <= {x.shape} ({x.numel()})"  # CUDAGRAPH unfriendly?
     return x.flatten()[:prod(v)].view(*v)
 
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index cea4d26a4..2438ec30b 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -3,7 +3,7 @@
 
 import functools
 import importlib.util
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -452,6 +452,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         if envs.VLLM_USE_DEEP_GEMM:
             if not has_deep_gemm:
                 logger.warning_once("Failed to import DeepGemm kernels.")
+            elif not self.block_quant:
+                logger.warning_once("Model is not block quantized. Not using "
+                                    " DeepGemm kernels")
             elif (current_platform.is_cuda()
                   and current_platform.has_device_capability(90)):
                 logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.")
@@ -460,8 +463,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 logger.warning_once(
                     "DeepGemm not supported on the current platform.")
 
+        self.topk_indices_dtype = None
         self.fused_experts = functools.partial(  # type: ignore
             fused_experts,
+            use_fp8_w8a8=True,
             block_shape=self.quant_config.weight_block_size,
             allow_deep_gemm=self.allow_deep_gemm)
 
@@ -765,18 +770,39 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             del layer.w2_input_scale
 
     def select_gemm_impl(self, prepare_finalize):
+
+        from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+            BatchedTritonExperts)
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
             TritonOrDeepGemmExperts)
 
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
             "Marlin and ROCm AITER are not supported with all2all yet.")
 
-        experts = TritonOrDeepGemmExperts(
-            use_fp8_w8a8=True,
-            block_shape=self.quant_config.weight_block_size,
-            allow_deep_gemm=self.allow_deep_gemm,
-        )
+        experts: Optional[Union[BatchedTritonExperts,
+                                TritonOrDeepGemmExperts]] = None
+        max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
+        use_batched_experts = max_num_tokens_per_rank is not None
 
+        if use_batched_experts:
+            experts = BatchedTritonExperts(
+                max_num_tokens=max_num_tokens_per_rank,
+                world_size=prepare_finalize.world_size,
+                dp_size=prepare_finalize.dp_size,
+                use_fp8_w8a8=True,
+                use_int8_w8a8=False,
+                use_int8_w8a16=False,
+                use_int4_w4a16=False,
+                block_shape=None,
+            )
+        else:
+            experts = TritonOrDeepGemmExperts(
+                use_fp8_w8a8=True,
+                block_shape=self.quant_config.weight_block_size,
+                allow_deep_gemm=self.allow_deep_gemm,
+            )
+
+        assert experts is not None
         return experts
 
     def apply(
@@ -797,6 +823,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -808,6 +835,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias,
+            indices_type=self.topk_indices_dtype,
         )
 
         if self.rocm_aiter_moe_enabled:
@@ -855,7 +883,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 topk_ids=topk_ids,
                 inplace=True,
                 activation=activation,
-                use_fp8_w8a8=True,
                 global_num_experts=global_num_experts,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 expert_map=expert_map,
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index e2d9424de..07ae470fa 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -154,6 +154,21 @@ class CudaPlatformBase(Platform):
                 logger.info(
                     "Forcing kv cache block size to 64 for FlashMLA backend.")
 
+        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
+                and parallel_config.data_parallel_size > 1
+                and vllm_config.compilation_config.use_cudagraph):
+            logger.info(
+                "Data Parallel: Forcing enforce eager to be True since DP "
+                "with DeepEP high-throughput kernels are not CUDA Graph "
+                "compatible. The DeepEP low-latency kernels are CUDA Graph "
+                "compatible. Set the all_to_all backend to deepep_low_latency "
+                "to use those kernels instead.")
+            vllm_config.compilation_config.use_cudagraph = False
+            vllm_config.model_config.enforce_eager = True
+            # TODO (varun): Turning this ON gives incorrect results for the
+            # Deepseek-V2-lite model.
+            vllm_config.compilation_config.use_inductor = False
+
     @classmethod
     def get_current_memory_usage(cls,
                                  device: Optional[torch.types.Device] = None
-- 
GitLab


From bdf13965ab4a528d30cb82854487910189865d9d Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Date: Tue, 3 Jun 2025 13:33:07 -0700
Subject: [PATCH 144/274] [V1] Support cross-layer KV sharing (#18212)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
---
 tests/v1/tpu/worker/test_tpu_model_runner.py  | 227 +++++++++++++++-
 tests/v1/worker/test_gpu_model_runner.py      | 244 +++++++++++++++++-
 vllm/attention/backends/abstract.py           |   1 +
 vllm/attention/backends/blocksparse_attn.py   |   3 +
 vllm/attention/backends/cpu_mla.py            |   3 +-
 .../backends/dual_chunk_flash_attn.py         |   3 +
 vllm/attention/backends/flash_attn.py         |   3 +
 vllm/attention/backends/flashinfer.py         |   3 +
 vllm/attention/backends/flashmla.py           |   3 +-
 vllm/attention/backends/hpu_attn.py           |   3 +
 vllm/attention/backends/ipex_attn.py          |   3 +
 vllm/attention/backends/mla/common.py         |   3 +
 vllm/attention/backends/pallas.py             |   3 +
 vllm/attention/backends/rocm_aiter_mla.py     |   3 +-
 vllm/attention/backends/rocm_flash_attn.py    |   3 +
 vllm/attention/backends/torch_sdpa.py         |   3 +
 vllm/attention/backends/triton_mla.py         |   3 +-
 vllm/attention/backends/xformers.py           |   3 +
 vllm/attention/layer.py                       |  17 +-
 vllm/v1/attention/backends/flash_attn.py      |  36 +--
 vllm/v1/attention/backends/flashinfer.py      |  36 +--
 vllm/v1/attention/backends/mla/common.py      |   4 +
 vllm/v1/attention/backends/mla/flashmla.py    |   3 +-
 .../attention/backends/mla/rocm_aiter_mla.py  |   3 +-
 vllm/v1/attention/backends/mla/triton_mla.py  |   3 +-
 vllm/v1/attention/backends/pallas.py          |   6 +-
 vllm/v1/attention/backends/triton_attn.py     |  51 ++--
 vllm/v1/attention/backends/utils.py           |  33 +++
 vllm/v1/worker/gpu_model_runner.py            |  31 ++-
 vllm/v1/worker/tpu_model_runner.py            |  30 ++-
 vllm/v1/worker/utils.py                       |  36 +++
 31 files changed, 733 insertions(+), 73 deletions(-)

diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 230c97e78..bc54b6ecc 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -4,8 +4,13 @@ import unittest.mock as mock
 
 import pytest
 
-from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
+from vllm.attention.layer import Attention
+from vllm.config import (CacheConfig, ModelConfig, SchedulerConfig, VllmConfig,
+                         set_current_vllm_config)
 from vllm.sampling_params import SamplingParams
+from vllm.utils import GiB_bytes
+from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
+                                         get_kv_cache_config)
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
 from vllm.v1.worker.tpu_model_runner import (
@@ -363,3 +368,223 @@ def test_get_req_paddings():
     assert _get_req_paddings(1, 32) == [8, 16, 32]
     assert _get_req_paddings(8, 32) == [8, 16, 32]
     assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
+
+
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} must come before the current layer"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+                kv_sharing_target_layer_name=layer_1,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    invalid_layer = "model.layers.0.cross_attn.attn"
+    error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                # invalid layer: cross_attn.atn doesn't exist!
+                kv_sharing_target_layer_name=invalid_layer,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_same_as_current():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} cannot be the same as the current layer"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_without_kv_sharing(model_runner):
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = model_runner.vllm_config
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    kv_cache_spec = model_runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 2
+    assert len(model_runner.shared_kv_cache_layers) == 0
+
+    available_memory = 20 * GiB_bytes
+    # page size for layer 0's kv_cache_spec is 32KB
+    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.tensors) == 2
+    assert kv_cache_config.tensors[layer_0].size == available_memory // 2
+    assert kv_cache_config.tensors[layer_1].size == available_memory // 2
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    assert max_context_len == 1310720
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 2 block worth of memory (2 * 32kb)
+    kv_cache_config.num_blocks = 1
+    for layer in kv_cache_config.tensors:
+        kv_cache_config.tensors[layer].size =\
+            kv_cache_spec[layer].page_size_bytes
+
+    model_runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache does NOT share memory with layer 0
+    assert id(layer_1_kv) != id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
+
+
+def test_init_kv_cache_with_kv_sharing_valid(model_runner):
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = model_runner.vllm_config
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    kv_cache_spec = model_runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 1
+    assert layer_0 in kv_cache_spec
+    assert model_runner.shared_kv_cache_layers[layer_1] == layer_0
+
+    available_memory = 20 * GiB_bytes
+    # page size for layer 0's kv_cache_spec is 32KB
+    # with KV sharing, we can allocate (available_mem//page_size//1) blocks
+    # which is twice as many as without KV sharing
+    num_expected_blocks = 655360  # 20GB / 32KB
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.tensors) == 1
+    # Each layer now has twice the available memory for KV cache
+    # compared to no KV sharing
+    assert kv_cache_config.tensors[layer_0].size == available_memory
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    assert max_context_len == 2 * 1310720
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 1 block worth of memory (32kb)
+    kv_cache_config.num_blocks = 1
+    kv_cache_config.tensors[layer_0].size =\
+        kv_cache_spec[layer_0].page_size_bytes
+
+    model_runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache shares memory with layer 0
+    assert id(layer_1_kv) == id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index ceb9d4df2..5e2fd2fbf 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -7,8 +7,11 @@ import pytest
 
 from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
+                         SchedulerConfig, VllmConfig, set_current_vllm_config)
 from vllm.sampling_params import SamplingParams
+from vllm.utils import GiB_bytes
+from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
+                                         get_kv_cache_config)
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -19,6 +22,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
 BLOCK_SIZE = 16
 NUM_BLOCKS = 10
+DEVICE = "cuda"
 
 
 def initialize_kv_cache(runner: GPUModelRunner):
@@ -55,8 +59,7 @@ def initialize_kv_cache(runner: GPUModelRunner):
     runner.initialize_attn_backend(kv_cache_config)
 
 
-@pytest.fixture
-def model_runner():
+def get_vllm_config():
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
@@ -84,13 +87,18 @@ def model_runner():
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
     )
-    num_heads = model_config.get_num_kv_heads(parallel_config)
+    return vllm_config
+
+
+@pytest.fixture
+def model_runner():
+    vllm_config = get_vllm_config()
+    model_config = vllm_config.model_config
+    num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config)
     head_size = model_config.get_head_size()
     vllm_config.compilation_config.static_forward_context[
         "layer.0"] = Attention(num_heads, head_size, 0.1)
-
-    device = "cuda"
-    runner = GPUModelRunner(vllm_config, device)
+    runner = GPUModelRunner(vllm_config, DEVICE)
     initialize_kv_cache(runner)
     return runner
 
@@ -385,3 +393,225 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
     model_runner_2.load_model()  # Load real weights inplace
     assert str(model_runner.get_model().state_dict()) == str(
         model_runner_2.get_model().state_dict())
+
+
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} must come before the current layer"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+                kv_sharing_target_layer_name=layer_1,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    invalid_layer = "model.layers.0.cross_attn.attn"
+    error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                # invalid layer: cross_attn.atn doesn't exist!
+                kv_sharing_target_layer_name=invalid_layer,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_same_as_current():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} cannot be the same as the current layer"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_without_kv_sharing():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = get_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    runner = GPUModelRunner(vllm_config, DEVICE)
+    kv_cache_spec = runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 2
+    assert len(runner.shared_kv_cache_layers) == 0
+
+    available_memory = 20 * GiB_bytes
+    # page size for layer 0's kv_cache_spec is 32KB
+    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.tensors) == 2
+    assert kv_cache_config.tensors[layer_0].size == available_memory // 2
+    assert kv_cache_config.tensors[layer_1].size == available_memory // 2
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    assert max_context_len == 1310720
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 2 block worth of memory (2 * 32kb)
+    kv_cache_config.num_blocks = 1
+    for layer in kv_cache_config.tensors:
+        kv_cache_config.tensors[layer].size =\
+            kv_cache_spec[layer].page_size_bytes
+
+    runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache does NOT share memory with layer 0
+    assert id(layer_1_kv) != id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
+
+
+def test_init_kv_cache_with_kv_sharing_valid():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = get_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    runner = GPUModelRunner(vllm_config, DEVICE)
+    kv_cache_spec = runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 1
+    assert layer_0 in kv_cache_spec
+    assert runner.shared_kv_cache_layers[layer_1] == layer_0
+
+    available_memory = 20 * GiB_bytes
+    # page size for layer 0's kv_cache_spec is 32KB
+    # with KV sharing, we can allocate (available_mem//page_size//1) blocks
+    # which is twice as many as without KV sharing
+    num_expected_blocks = 655360  # 20GB / 32KB
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.tensors) == 1
+    # Each layer now has twice the available memory for KV cache
+    # compared to no KV sharing
+    assert kv_cache_config.tensors[layer_0].size == available_memory
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    assert max_context_len == 2 * 1310720
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 1 block worth of memory (32kb)
+    kv_cache_config.num_blocks = 1
+    kv_cache_config.tensors[layer_0].size =\
+        kv_cache_spec[layer_0].page_size_bytes
+
+    runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache shares memory with layer 0
+    assert id(layer_1_kv) == id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index deb3951d6..0ba5a5bf9 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -270,6 +270,7 @@ class AttentionImpl(ABC, Generic[T]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
     ) -> None:
         raise NotImplementedError
 
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index a2fd557f8..c1663516d 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -306,7 +306,10 @@ class BlocksparseFlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         assert blocksparse_params is not None
         assert alibi_slopes is None, ValueError(
             "Alibi not support for blocksparse flash attention.")
diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py
index 39e667bca..cf7883e12 100644
--- a/vllm/attention/backends/cpu_mla.py
+++ b/vllm/attention/backends/cpu_mla.py
@@ -206,12 +206,13 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
             blocksparse_params: Optional[Dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py
index 3548df88d..963bccdf2 100644
--- a/vllm/attention/backends/dual_chunk_flash_attn.py
+++ b/vllm/attention/backends/dual_chunk_flash_attn.py
@@ -290,9 +290,12 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         layer_idx: int = -1,
         dual_chunk_attention_config: Optional[Dict[str, Any]] = None,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 26be2c04f..73e377268 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -618,8 +618,11 @@ class FlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if blocksparse_params is not None:
             raise ValueError(
                 "FlashAttention does not support block-sparse attention.")
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 7ae7ea37f..a3937760f 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -936,8 +936,11 @@ class FlashInferImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in FlashInfer is not supported yet, it will fall"
diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py
index 9a6b8a40e..e185d0260 100644
--- a/vllm/attention/backends/flashmla.py
+++ b/vllm/attention/backends/flashmla.py
@@ -184,12 +184,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
             blocksparse_params: Optional[Dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str] = None,
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         assert is_flashmla_supported(), \
             "FlashMLA is not supported on this device"
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 5128e4975..9bd513fd8 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -110,9 +110,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
         super(AttentionImpl, self).__init__()
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in HPU is not supported yet, it will fall back "
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 30441b3ad..5051c6a7c 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -123,8 +123,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in Ipex is not supported yet, it will fall"
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index 50842abd3..78cf95288 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -1000,6 +1000,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         blocksparse_params: Optional[Dict[str, Any]],
         logits_soft_cap: Optional[float],
         attn_type: str,
+        kv_sharing_target_layer_name: Optional[str],
         # MLA Specific Arguments
         q_lora_rank: Optional[int],
         kv_lora_rank: int,
@@ -1009,6 +1010,8 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         v_head_dim: int,
         kv_b_proj: ColumnParallelLinear,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing not supported in V0.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index a6823ac05..7ad67615d 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -109,8 +109,11 @@ class PallasAttentionBackendImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in Pallas is not supported yet, it will fall back "
diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py
index 855036071..1edf34351 100644
--- a/vllm/attention/backends/rocm_aiter_mla.py
+++ b/vllm/attention/backends/rocm_aiter_mla.py
@@ -370,12 +370,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 755e0da06..4b460dc0b 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -494,8 +494,11 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in ROCm Flash Attention is not supported yet, it "
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 760634004..f3fb5adcf 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -405,8 +405,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if blocksparse_params is not None:
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
index d9fff8fac..e06f7d54e 100644
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
@@ -38,12 +38,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
             blocksparse_params: Optional[Dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 8355e0397..04ef928b7 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -390,8 +390,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if blocksparse_params is not None:
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 6c5b05a5c..a5fbd1a1c 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -21,6 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
+from vllm.v1.attention.backends.utils import validate_kv_sharing_target
 
 
 class Attention(nn.Module):
@@ -50,6 +51,7 @@ class Attention(nn.Module):
         use_mla: bool = False,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         **extra_impl_args,
     ) -> None:
         """
@@ -135,7 +137,7 @@ class Attention(nn.Module):
         self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                              alibi_slopes, sliding_window, kv_cache_dtype,
                              blocksparse_params, logits_soft_cap, attn_type,
-                             **extra_impl_args)
+                             kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
 
@@ -153,6 +155,19 @@ class Attention(nn.Module):
         compilation_config.static_forward_context[prefix] = self
         self.layer_name = prefix
         self.attn_type = attn_type
+
+        if kv_sharing_target_layer_name is not None:
+            if not envs.VLLM_USE_V1:
+                raise NotImplementedError(
+                    "Cross-layer KV sharing is not supported in V0.")
+
+            validate_kv_sharing_target(
+                prefix,
+                kv_sharing_target_layer_name,
+                compilation_config.static_forward_context,
+            )
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
+
         # use a placeholder kv cache tensor during init, which will be replaced
         # by bind_kv_cache
         # this variable will not be accessed if use_direct_call is True
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 9e989df1c..a92c51883 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -485,6 +485,7 @@ class FlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
@@ -506,6 +507,7 @@ class FlashAttentionImpl(AttentionImpl):
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
             logits_soft_cap = 0
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@@ -569,22 +571,26 @@ class FlashAttentionImpl(AttentionImpl):
         # performance to make sure it does not introduce any overhead.
 
         num_actual_tokens = attn_metadata.num_actual_tokens
-        # Reshape the input keys and values and store them in the cache.
-        # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-        # not padded. However, we don't need to do key[:num_actual_tokens] and
-        # value[:num_actual_tokens] because the reshape_and_cache_flash op uses
-        # the slot_mapping's shape to determine the number of actual tokens.
         key_cache, value_cache = kv_cache.unbind(0)
-        torch.ops._C_cache_ops.reshape_and_cache_flash(
-            key,
-            value,
-            key_cache,
-            value_cache,
-            attn_metadata.slot_mapping,
-            self.kv_cache_dtype,
-            layer._k_scale,
-            layer._v_scale,
-        )
+
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
+            # not padded. However, we don't need to do key[:num_actual_tokens]
+            # and value[:num_actual_tokens] because the reshape_and_cache_flash
+            # op uses the slot_mapping's shape to determine the number of
+            # actual tokens.
+            torch.ops._C_cache_ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(torch.float8_e4m3fn)
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 8bd998eba..f1b61c152 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -507,6 +507,7 @@ class FlashInferImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[int] = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -521,6 +522,7 @@ class FlashInferImpl(AttentionImpl):
             self.sliding_window = (sliding_window - 1, 0)
         self.kv_cache_dtype = kv_cache_dtype
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@@ -568,21 +570,25 @@ class FlashInferImpl(AttentionImpl):
         # performance to make sure it does not introduce any overhead.
 
         num_actual_tokens = attn_metadata.num_actual_tokens
-        # Reshape the input keys and values and store them in the cache.
-        # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-        # not padded. However, we don't need to do key[:num_actual_tokens] and
-        # value[:num_actual_tokens] because the reshape_and_cache_flash op uses
-        # the slot_mapping's shape to determine the number of actual tokens.
-        torch.ops._C_cache_ops.reshape_and_cache_flash(
-            key,
-            value,
-            kv_cache[:, 0],
-            kv_cache[:, 1],
-            attn_metadata.slot_mapping,
-            self.kv_cache_dtype,
-            layer._k_scale,
-            layer._v_scale,
-        )
+
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
+            # not padded. However, we don't need to do key[:num_actual_tokens]
+            # and value[:num_actual_tokens] because the reshape_and_cache_flash
+            # op uses the slot_mapping's shape to determine the number of
+            # actual tokens.
+            torch.ops._C_cache_ops.reshape_and_cache_flash(
+                key,
+                value,
+                kv_cache[:, 0],
+                kv_cache[:, 1],
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
 
         window_left = (self.sliding_window[0]
                        if self.sliding_window is not None else -1)
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 96befca5a..06acbb909 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -586,6 +586,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         blocksparse_params: Optional[dict[str, Any]],
         logits_soft_cap: Optional[float],
         attn_type: str,
+        kv_sharing_target_layer_name: Optional[str],
         # MLA Specific Arguments
         q_lora_rank: Optional[int],
         kv_lora_rank: int,
@@ -595,6 +596,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         v_head_dim: int,
         kv_b_proj: ColumnParallelLinear,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported for MLA")
+
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index 060a7c9d8..318b8ede1 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -93,12 +93,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         assert is_flashmla_supported(), \
             "FlashMLA is not supported on this device"
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 8925b5a5c..1f0406a7a 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -139,12 +139,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
         assert (num_heads == 16 or num_heads == 128), (
             f"Aiter MLA only supports 16 or 128 number of heads.\n"
             f"Provided {num_heads} number of heads.\n"
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 0857fc133..e26d79091 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -41,12 +41,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index 896f1394c..0f956ba88 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -113,6 +113,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[int] = None,
         use_irope: bool = False,
     ) -> None:
         if use_irope:
@@ -128,6 +129,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
         self.num_kv_heads = num_kv_heads
         self.sliding_window = sliding_window
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@@ -181,7 +183,9 @@ class PallasAttentionBackendImpl(AttentionImpl):
         num_tokens, hidden_size = query.shape
         query = query.view(num_tokens, self.num_heads, self.head_size)
 
-        if kv_cache.numel() > 0:
+        if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0:
+            # Write input keys and values to the KV cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
             slot_mapping = attn_metadata.slot_mapping
             write_to_kv_cache(key, value, kv_cache, slot_mapping)
 
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 6a3314dd8..968f13701 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -88,6 +88,7 @@ class TritonAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[int] = None,
         use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
@@ -109,6 +110,7 @@ class TritonAttentionImpl(AttentionImpl):
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
             logits_soft_cap = 0
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         self.use_irope = use_irope
 
@@ -178,31 +180,34 @@ class TritonAttentionImpl(AttentionImpl):
         if use_prefill_decode_attn:
             key_cache, value_cache = PagedAttention.split_kv_cache(
                 kv_cache, self.num_kv_heads, self.head_size)
-
-            # Reshape the input keys and values and store them in the cache.
-            PagedAttention.write_to_paged_cache(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
-
         else:
             key_cache, value_cache = kv_cache.unbind(0)
-            torch.ops._C_cache_ops.reshape_and_cache_flash(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
+
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            if use_prefill_decode_attn:
+                PagedAttention.write_to_paged_cache(
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    attn_metadata.slot_mapping,
+                    self.kv_cache_dtype,
+                    layer._k_scale,
+                    layer._v_scale,
+                )
+            else:
+                torch.ops._C_cache_ops.reshape_and_cache_flash(
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    attn_metadata.slot_mapping,
+                    self.kv_cache_dtype,
+                    layer._k_scale,
+                    layer._v_scale,
+                )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(self.fp8_dtype)
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 2e65619ed..72c764353 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -17,3 +17,36 @@ class CommonAttentionMetadata:
     seq_lens: torch.Tensor
     """(batch_size,), the length of each request including both computed tokens
     and newly scheduled tokens"""
+
+
+def validate_kv_sharing_target(current_layer_name, target_layer_name,
+                               static_forward_context):
+    error_msg = (f"Specified KV sharing target layer for {current_layer_name} "
+                 f"is not valid: target layer {target_layer_name} ")
+
+    if current_layer_name == target_layer_name:
+        raise ValueError(error_msg +
+                         "cannot be the same as the current layer.")
+
+    if target_layer_name not in static_forward_context:
+        from vllm.model_executor.models.utils import extract_layer_index
+
+        # If target layer name is not in the static fwd context, it means either
+        # a) the target layer does not come BEFORE the current layer, or
+        # b) the target layer is not an Attention layer that exists in the model
+        current_layer_idx = extract_layer_index(current_layer_name)
+        target_layer_idx = extract_layer_index(target_layer_name)
+        if current_layer_idx <= target_layer_idx:
+            raise ValueError(error_msg + "must come before the current layer.")
+        else:
+            raise ValueError(error_msg +
+                             "is not a valid Attention layer in the model.")
+
+    # Currently KV sharing is only supported between layers of the same type
+    target_layer_attn_type = static_forward_context[
+        target_layer_name].attn_type
+    expected = static_forward_context[current_layer_name].attn_type
+    if target_layer_attn_type != expected:
+        raise ValueError(
+            error_msg +
+            f"must be the same type as the current layer ({expected}).")
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c96ad0c01..b7448be26 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -59,8 +59,8 @@ from vllm.v1.worker.block_table import BlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
-from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs,
-                    scatter_mm_placeholders)
+from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing,
+                    sanity_check_mm_encoder_outputs, scatter_mm_placeholders)
 
 if TYPE_CHECKING:
     import xgrammar as xgr
@@ -276,6 +276,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                         pin_memory=self.pin_memory)
         self.seq_lens_np = self.seq_lens_cpu.numpy()
 
+        # Layer pairings for cross-layer KV sharing.
+        # If an Attention layer `layer_name` is in the keys of this dict, it
+        # means this layer will perform attention using the keys and values
+        # from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        self.shared_kv_cache_layers: dict[str, str] = {}
+
     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool:
         """
         Update the order of requests in the batch based on the attention
@@ -2097,6 +2103,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     # KV cache specs.
                     raise ValueError("Unknown KV cache spec type.")
 
+        # Setup `kv_cache_config` and `kv_caches` for models
+        # with cross-layer KV sharing
+        if self.shared_kv_cache_layers:
+            initialize_kv_cache_for_kv_sharing(
+                self.shared_kv_cache_layers,
+                kv_cache_config.kv_cache_groups,
+                kv_caches,
+            )
+
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
             # validate all draft model layers belong to the same kv cache
@@ -2125,6 +2140,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         use_mla = self.vllm_config.model_config.use_mla
         kv_cache_spec: dict[str, KVCacheSpec] = {}
         for layer_name, attn_module in layers.items():
+            if (kv_tgt_layer :=
+                    attn_module.kv_sharing_target_layer_name) is not None:
+                # The layer doesn't need its own KV cache and will use that of
+                # the target layer. We skip creating a KVCacheSpec for it, so
+                # that KV cache management logic will act as this layer does
+                # not exist, and doesn't allocate KV cache for the layer. This
+                # enables the memory saving of cross-layer kv sharing, allowing
+                # a given amount of memory to accommodate longer context lengths
+                # or enable more requests to be processed simultaneously.
+                self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
+                continue
+
             # TODO: Support other attention modules, e.g., cross-attention
             if attn_module.attn_type == AttentionType.DECODER:
                 if attn_module.sliding_window is not None:
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 48ea3cb7b..f15234f49 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -44,7 +44,8 @@ from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
-from .utils import sanity_check_mm_encoder_outputs
+from .utils import (initialize_kv_cache_for_kv_sharing,
+                    sanity_check_mm_encoder_outputs)
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
@@ -238,6 +239,12 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.num_reqs_paddings = _get_req_paddings(
             min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)
 
+        # Layer pairings for cross-layer KV sharing.
+        # If an Attention layer `layer_name` is in the keys of this dict, it
+        # means this layer will perform attention using the keys and values
+        # from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        self.shared_kv_cache_layers: dict[str, str] = {}
+
         # tensors for structured decoding
         self.grammar_bitmask_cpu = torch.zeros(
             (self.max_num_reqs, cdiv(self.vocab_size, 32)),
@@ -455,6 +462,18 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         block_size = self.vllm_config.cache_config.block_size
         kv_cache_spec: dict[str, KVCacheSpec] = {}
         for layer_name, attn_module in layers.items():
+            if (kv_tgt_layer :=
+                    attn_module.kv_sharing_target_layer_name) is not None:
+                # The layer doesn't need its own KV cache and will use that of
+                # the target layer. We skip creating a KVCacheSpec for it, so
+                # that KV cache management logic will act as this layer does
+                # not exist, and doesn't allocate KV cache for the layer. This
+                # enables the memory saving of cross-layer kv sharing, allowing
+                # a given amount of memory to accommodate longer context lengths
+                # or enable more requests to be processed simultaneously.
+                self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
+                continue
+
             if attn_module.attn_type == AttentionType.DECODER:
                 if attn_module.sliding_window is not None:
                     kv_cache_spec[layer_name] = SlidingWindowSpec(
@@ -1376,6 +1395,15 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 else:
                     raise NotImplementedError
 
+        # Setup `kv_cache_config` and `kv_caches` for models
+        # with cross-layer KV sharing
+        if self.shared_kv_cache_layers:
+            initialize_kv_cache_for_kv_sharing(
+                self.shared_kv_cache_layers,
+                kv_cache_config.kv_cache_groups,
+                kv_caches,
+            )
+
         bind_kv_cache(
             kv_caches,
             self.vllm_config.compilation_config.static_forward_context,
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index b23b28c1d..055cf0153 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -4,6 +4,8 @@ from typing import Optional
 
 import torch
 
+from vllm.v1.kv_cache_interface import KVCacheGroupSpec
+
 
 def sanity_check_mm_encoder_outputs(
     mm_embeddings: object,
@@ -73,3 +75,37 @@ def gather_mm_placeholders(
         return placeholders
 
     return placeholders[is_embed]
+
+
+def initialize_kv_cache_for_kv_sharing(
+    shared_kv_cache_layers: dict[str, str],
+    kv_cache_groups: list[KVCacheGroupSpec],
+    kv_caches: dict[str, torch.Tensor],
+) -> None:
+    """
+    Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches`
+    for layers that do not allocate its own KV cache, based on the mapping in
+    `shared_kv_cache_layers`. Adds these layers to the corresponding KV cache
+    group, which is needed to ensure that attention metadata is assigned later.
+
+    Args:
+        shared_kv_cache_layers: Layer pairings for cross-layer KV sharing.
+            If an Attention layer `layer_name` is in the keys of this dict, it
+            means this layer will perform attention using the keys and values
+            from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        kv_cache_groups: The KV cache groups of the model.
+        kv_caches: The allocated kv_caches with layer names as keys.
+            Note that layers in shared_kv_cache_layers.keys() are not
+            originally included as it only contains layers which have its own
+            KV cache allocation.
+    """
+    # Record index of KV cache group for each layer that allocates a KV cache.
+    layer_to_kv_cache_group_idx: dict[str, int] = {}
+    for i, kv_cache_group in enumerate(kv_cache_groups):
+        for layer_name in kv_cache_group.layer_names:
+            layer_to_kv_cache_group_idx[layer_name] = i
+
+    for layer_name, target_layer_name in shared_kv_cache_layers.items():
+        kv_caches[layer_name] = kv_caches[target_layer_name]
+        group_idx = layer_to_kv_cache_group_idx[target_layer_name]
+        kv_cache_groups[group_idx].layer_names.append(layer_name)
-- 
GitLab


From e31446b6c8d887cdca031abf8527555adee46058 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 3 Jun 2025 16:48:25 -0400
Subject: [PATCH 145/274] [Perf] Tune `scaled_fp8_quant` by increasing
 vectorization (#18844)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 csrc/quantization/fp8/common.cu               | 35 ++++---
 csrc/quantization/fp8/common.cuh              | 68 ++++++-------
 .../fused_kernels/layernorm_utils.cuh         | 99 ++++++++++---------
 csrc/quantization/vectorization.cuh           | 23 +++--
 4 files changed, 115 insertions(+), 110 deletions(-)

diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu
index eceb3a8ea..f3f9f669e 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   fp8_type* __restrict__ token_output = &out[offset];
 
   // For vectorization, token_input and token_output pointers need to be
-  // aligned at 8-byte and 4-byte addresses respectively.
-  bool const can_vectorize = hidden_size % 4 == 0;
+  // aligned at 32-byte and 16-byte addresses respectively.
+  bool const can_vectorize = hidden_size % 16 == 0;
 
   float absmax_val = 0.0f;
   if (can_vectorize) {
@@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   } else {
     for (int i = tid; i < hidden_size; i += blockDim.x) {
       float const x = static_cast<float>(token_input[i]);
-      absmax_val = max(absmax_val, fabs(x));
+      absmax_val = fmaxf(absmax_val, fabsf(x));
     }
   }
 
-  using BlockReduce = cub::BlockReduce<float, 1024>;
+  using BlockReduce = cub::BlockReduce<float, 256>;
   __shared__ typename BlockReduce::TempStorage reduceStorage;
   float const block_absmax_val_maybe =
       BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
   __shared__ float token_scale;
   if (tid == 0) {
     if (scale_ub) {
-      token_scale = min(block_absmax_val_maybe, *scale_ub);
+      token_scale = fminf(block_absmax_val_maybe, *scale_ub);
     } else {
       token_scale = block_absmax_val_maybe;
     }
     // token scale computation
-    token_scale = max(token_scale / quant_type_max_v<fp8_type>,
-                      min_scaling_factor<fp8_type>::val());
+    token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
+                        min_scaling_factor<fp8_type>::val());
     scale[token_idx] = token_scale;
   }
   __syncthreads();
@@ -88,10 +88,11 @@ void static_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                              torch::Tensor const& input,  // [..., d]
                              torch::Tensor const& scale)  // [1]
 {
-  int64_t num_tokens = input.numel() / input.size(-1);
-  int64_t num_elems = input.numel();
-  dim3 grid(num_tokens);
-  dim3 block(1024);
+  int const block_size = 256;
+  int const num_tokens = input.numel() / input.size(-1);
+  int const num_elems = input.numel();
+  dim3 const grid(num_tokens);
+  dim3 const block(block_size);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -110,10 +111,11 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                               torch::Tensor const& input,  // [..., d]
                               torch::Tensor& scale)        // [1]
 {
-  int64_t num_tokens = input.numel() / input.size(-1);
-  int64_t num_elems = input.numel();
-  dim3 grid(num_tokens);
-  dim3 block(1024);
+  int const block_size = 256;
+  int const num_tokens = input.numel() / input.size(-1);
+  int const num_elems = input.numel();
+  dim3 const grid(num_tokens);
+  dim3 const block(block_size);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -141,8 +143,9 @@ void dynamic_per_token_scaled_fp8_quant(
 
   int const hidden_size = input.size(-1);
   int const num_tokens = input.numel() / hidden_size;
+  int const block_size = 256;
   dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
+  dim3 const block(std::min(hidden_size, block_size));
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh
index def8b31b2..d36f94a8f 100644
--- a/csrc/quantization/fp8/common.cuh
+++ b/csrc/quantization/fp8/common.cuh
@@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
   }
 
   float r =
-      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
+      fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
 #ifndef USE_ROCM
   return static_cast<fp8_type>(r);
 #else
@@ -65,7 +65,7 @@ template <typename scalar_t, typename fp8_type>
 __global__ void segmented_max_reduction(float* __restrict__ scale,
                                         const scalar_t* __restrict__ input,
                                         int64_t num_elems) {
-  __shared__ float cache[1024];
+  __shared__ float cache[256];
   int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
 
   // First store maximum for all values processes by
@@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
   scalar_t tmp = 0.0;
   while (i < num_elems) {
     float x = static_cast<float>(input[i]);
-    tmp = max(tmp, fabs(x));
+    tmp = fmaxf(tmp, fabsf(x));
     i += blockDim.x * gridDim.x;
   }
   cache[threadIdx.x] = tmp;
@@ -100,25 +100,27 @@ template <typename scalar_t>
 __device__ float thread_max_vec(scalar_t const* __restrict__ input,
                                 int64_t const num_elems, int const tid,
                                 int const step) {
+  constexpr size_t VEC_SIZE = 16;
+  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
   // Vectorized input/output to better utilize memory bandwidth.
-  vec4_t<scalar_t> const* vectorized_in =
-      reinterpret_cast<vec4_t<scalar_t> const*>(input);
+  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
 
-  int64_t const num_vec_elems = num_elems >> 2;
+  // num_elems / VEC_SIZE (which is 16)
+  int64_t const num_vec_elems = num_elems >> 4;
   float absmax_val = 0.0f;
 
-#pragma unroll 4
+#pragma unroll
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    vec4_t<scalar_t> in_vec = vectorized_in[i];
-    absmax_val = max(absmax_val, fabs(in_vec.x));
-    absmax_val = max(absmax_val, fabs(in_vec.y));
-    absmax_val = max(absmax_val, fabs(in_vec.z));
-    absmax_val = max(absmax_val, fabs(in_vec.w));
+    scalarxN_t in_vec = vectorized_in[i];
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
+    }
   }
 
-  // Handle the remaining elements if num_elems is not divisible by 4
-  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
-    absmax_val = max(absmax_val, fabs(input[i]));
+  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
+  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
+    absmax_val = fmaxf(absmax_val, fabsf(input[i]));
   }
 
   return absmax_val;
@@ -130,31 +132,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
                                           float const scale,
                                           int64_t const num_elems,
                                           int const tid, int const step) {
-  using float8x4_t = q8x4_t<fp8_type>;
+  constexpr size_t VEC_SIZE = 16;
+  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
+  using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
   // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
-  auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);
+  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
+  auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
 
-  int64_t const num_vec_elems = num_elems >> 2;
+  // num_elems / VEC_SIZE (which is 16)
+  int64_t const num_vec_elems = num_elems >> 4;
 
-#pragma unroll 4
+#pragma unroll
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    vec4_t<scalar_t> in_vec = vectorized_in[i];
-    float8x4_t out_vec;
-
-    out_vec.x = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.x), scale);
-    out_vec.y = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.y), scale);
-    out_vec.z = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.z), scale);
-    out_vec.w = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.w), scale);
+    scalarxN_t in_vec = vectorized_in[i];
+    float8xN_t out_vec;
+
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+          static_cast<float>(in_vec.val[j]), scale);
+    }
     vectorized_out[i] = out_vec;
   }
 
-  // Handle the remaining elements if num_elems is not divisible by 4
-  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
+  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
+  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
     out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
         static_cast<float>(input[i]), scale);
   }
diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh
index e6d23cd24..3f188872d 100644
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -140,6 +140,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   // sum of squares
   float ss = 0.0f;
 
+  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;
 
 #pragma unroll 4
@@ -147,22 +148,23 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
     vec4_t<scalar_t> in = vec_input[i];
 
     vec4_t<float> x;
-    x.x = static_cast<float>(in.x);
-    x.y = static_cast<float>(in.y);
-    x.z = static_cast<float>(in.z);
-    x.w = static_cast<float>(in.w);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-      x.x += static_cast<float>(r.x);
-      x.y += static_cast<float>(r.y);
-      x.z += static_cast<float>(r.z);
-      x.w += static_cast<float>(r.w);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
     }
 
-    ss += x.x * x.x;
-    ss += x.y * x.y;
-    ss += x.z * x.z;
-    ss += x.w * x.w;
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      ss += x.val[j] * x.val[j];
+    }
   }
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
@@ -203,6 +205,7 @@ __device__ void compute_dynamic_per_token_scales(
 
   constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
 
+  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;
   float block_absmax_val_maybe = 0.0f;
 
@@ -212,26 +215,25 @@ __device__ void compute_dynamic_per_token_scales(
     vec4_t<scalar_t> const w = vec_weight[i];
 
     vec4_t<float> x;
-    x.x = static_cast<float>(in.x);
-    x.y = static_cast<float>(in.y);
-    x.z = static_cast<float>(in.z);
-    x.w = static_cast<float>(in.w);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-      x.x += static_cast<float>(r.x);
-      x.y += static_cast<float>(r.y);
-      x.z += static_cast<float>(r.z);
-      x.w += static_cast<float>(r.w);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
     }
 
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.x * rms) * w.x));
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.y * rms) * w.y));
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.z * rms) * w.z));
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.w * rms) * w.w));
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      block_absmax_val_maybe =
+          fmaxf(block_absmax_val_maybe,
+                fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+    }
   }
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
@@ -282,6 +284,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
   }
 
+  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;
 
 // TODO(luka/varun) extract into type-agnostic vectorized quant function to
@@ -292,33 +295,31 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     vec4_t<scalar_t> const w = vec_weight[i];
 
     vec4_t<float> x;
-    x.x = static_cast<float>(in.x);
-    x.y = static_cast<float>(in.y);
-    x.z = static_cast<float>(in.z);
-    x.w = static_cast<float>(in.w);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-      x.x += static_cast<float>(r.x);
-      x.y += static_cast<float>(r.y);
-      x.z += static_cast<float>(r.z);
-      x.w += static_cast<float>(r.w);
-      // Update residual
-      r.x = static_cast<scalar_t>(x.x);
-      r.y = static_cast<scalar_t>(x.y);
-      r.z = static_cast<scalar_t>(x.z);
-      r.w = static_cast<scalar_t>(x.w);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
+// Update residual
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        r.val[j] = static_cast<scalar_t>(x.val[j]);
+      }
       vec_residual[i] = r;
     }
 
     q8x4_t<scalar_out_t> out;
-    out.x = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.x * rms) * w.x, scale);
-    out.y = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.y * rms) * w.y, scale);
-    out.z = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.z * rms) * w.z, scale);
-    out.w = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.w * rms) * w.w, scale);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
+          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
+    }
     vec_output[i] = out;
   }
 }
diff --git a/csrc/quantization/vectorization.cuh b/csrc/quantization/vectorization.cuh
index 866da10b5..11d57a5fa 100644
--- a/csrc/quantization/vectorization.cuh
+++ b/csrc/quantization/vectorization.cuh
@@ -10,23 +10,22 @@
 namespace vllm {
 
 // Vectorization containers
-template <typename scalar_t>
-struct __align__(8) vec4_t {
-  scalar_t x;
-  scalar_t y;
-  scalar_t z;
-  scalar_t w;
+template <typename scalar_t, size_t vec_size>
+struct __align__(vec_size * sizeof(scalar_t)) vec_n_t {
+  scalar_t val[vec_size];
 };
 
-template <typename quant_type_t>
-struct __align__(4) q8x4_t {
+template <typename quant_type_t, size_t vec_size>
+struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t {
   static_assert(std::is_same_v<quant_type_t, int8_t> ||
                 std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
                 std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
-  quant_type_t x;
-  quant_type_t y;
-  quant_type_t z;
-  quant_type_t w;
+  quant_type_t val[vec_size];
 };
 
+template <typename scalar_t>
+using vec4_t = vec_n_t<scalar_t, 4>;
+template <typename quant_type_t>
+using q8x4_t = q8_n_t<quant_type_t, 4>;
+
 }  // namespace vllm
-- 
GitLab


From 6865fe0074771ed56c1cb2eca047a8e74ab53ce9 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 3 Jun 2025 22:07:19 +0100
Subject: [PATCH 146/274] Fix interaction between `Optional` and `Annotated` in
 CLI typing (#19093)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikun@apache.org>
---
 tests/engine/test_arg_utils.py | 18 +++++++++++++++---
 vllm/engine/arg_utils.py       | 26 +++++++++++++++++++-------
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index ab78aa7da..cfbc7c245 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -5,14 +5,14 @@ import json
 from argparse import ArgumentError, ArgumentTypeError
 from contextlib import nullcontext
 from dataclasses import dataclass, field
-from typing import Literal, Optional
+from typing import Annotated, Literal, Optional
 
 import pytest
 
 from vllm.config import CompilationConfig, config
 from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
-                                   get_type, is_not_builtin, is_type,
-                                   literal_to_kwargs, nullable_kvs,
+                                   get_type, get_type_hints, is_not_builtin,
+                                   is_type, literal_to_kwargs, nullable_kvs,
                                    optional_type, parse_type)
 from vllm.utils import FlexibleArgumentParser
 
@@ -160,6 +160,18 @@ def test_is_not_builtin(type_hint, expected):
     assert is_not_builtin(type_hint) == expected
 
 
+@pytest.mark.parametrize(
+    ("type_hint", "expected"), [
+        (Annotated[int, "annotation"], {int}),
+        (Optional[int], {int, type(None)}),
+        (Annotated[Optional[int], "annotation"], {int, type(None)}),
+        (Optional[Annotated[int, "annotation"]], {int, type(None)}),
+    ],
+    ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"])
+def test_get_type_hints(type_hint, expected):
+    assert get_type_hints(type_hint) == expected
+
+
 def test_get_kwargs():
     kwargs = get_kwargs(DummyConfig)
     print(kwargs)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 587a23134..2197d44ca 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -15,7 +15,7 @@ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
 
 import regex as re
 import torch
-from pydantic import SkipValidation, TypeAdapter, ValidationError
+from pydantic import TypeAdapter, ValidationError
 from typing_extensions import TypeIs, deprecated
 
 import vllm.envs as envs
@@ -151,17 +151,29 @@ def is_not_builtin(type_hint: TypeHint) -> bool:
     return type_hint.__module__ != "builtins"
 
 
+def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
+    """Extract type hints from Annotated or Union type hints."""
+    type_hints: set[TypeHint] = set()
+    origin = get_origin(type_hint)
+    args = get_args(type_hint)
+
+    if origin is Annotated:
+        type_hints.update(get_type_hints(args[0]))
+    elif origin is Union:
+        for arg in args:
+            type_hints.update(get_type_hints(arg))
+    else:
+        type_hints.add(type_hint)
+
+    return type_hints
+
+
 def get_kwargs(cls: ConfigType) -> dict[str, Any]:
     cls_docs = get_attr_docs(cls)
     kwargs = {}
     for field in fields(cls):
         # Get the set of possible types for the field
-        type_hints: set[TypeHint] = set()
-        if get_origin(field.type) in {Union, Annotated}:
-            predicate = lambda arg: not isinstance(arg, SkipValidation)
-            type_hints.update(filter(predicate, get_args(field.type)))
-        else:
-            type_hints.add(field.type)
+        type_hints: set[TypeHint] = get_type_hints(field.type)
 
         # If the field is a dataclass, we can use the model_validate_json
         generator = (th for th in type_hints if is_dataclass(th))
-- 
GitLab


From 6cac54f4d1673991a415b9897d610c132104155b Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Wed, 4 Jun 2025 05:41:36 +0800
Subject: [PATCH 147/274] [v1] Re-init input batch for multiple kv cache groups
 (#18654)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/v1/worker/test_gpu_input_batch.py  | 29 ++-------------
 tests/v1/worker/test_gpu_model_runner.py |  4 ++-
 vllm/v1/worker/block_table.py            |  3 +-
 vllm/v1/worker/gpu_input_batch.py        | 18 +++++-----
 vllm/v1/worker/gpu_model_runner.py       | 46 ++++++++++++++++++++----
 vllm/v1/worker/tpu_model_runner.py       |  7 ++--
 6 files changed, 61 insertions(+), 46 deletions(-)

diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index e932e4b32..72547e86b 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -10,8 +10,6 @@ import torch
 
 from vllm.sampling_params import SamplingParams
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, KVCacheTensor)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -25,27 +23,6 @@ CUDA_DEVICES = [
 MAX_NUM_PROMPT_TOKENS = 64
 
 
-def get_kv_cache_config() -> KVCacheConfig:
-    return KVCacheConfig(
-        num_blocks=10,
-        tensors={
-            "layer.0": KVCacheTensor(size=1024),
-        },
-        kv_cache_groups=[
-            KVCacheGroupSpec(
-                layer_names=["layer.0"],
-                kv_cache_spec=FullAttentionSpec(
-                    block_size=1,
-                    num_kv_heads=1,
-                    head_size=16,
-                    dtype=torch.float16,
-                    use_mla=False,
-                ),
-            ),
-        ],
-    )
-
-
 def _compare_objs(obj1, obj2):
     attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a)))
     attr_names = set([
@@ -252,7 +229,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        block_size=1,
+        block_sizes=[1],
     )
     reqs: list[CachedRequestState] = []
     req_id_reqs = {}
@@ -342,7 +319,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        block_size=1,
+        block_sizes=[1],
     )
     ref_input_batch: InputBatch = InputBatch(
         max_num_reqs=batch_size,
@@ -351,7 +328,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        block_size=1,
+        block_sizes=[1],
     )
 
     reqs: list[CachedRequestState] = []
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 5e2fd2fbf..0553d94de 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -54,7 +54,9 @@ def initialize_kv_cache(runner: GPUModelRunner):
         device=runner.device,
         pin_memory=runner.pin_memory,
         vocab_size=runner.model_config.get_vocab_size(),
-        block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size,
+        block_sizes=[
+            kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
+        ],
     )
     runner.initialize_attn_backend(kv_cache_config)
 
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 958262c49..5cd5674fb 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -105,10 +105,11 @@ class MultiGroupBlockTable:
 
     def __init__(self, max_num_reqs: int, max_model_len: int,
                  max_num_batched_tokens: int, pin_memory: bool,
-                 device: torch.device, block_size: int) -> None:
+                 device: torch.device, block_sizes: list[int]) -> None:
         self.block_tables = [
             BlockTable(max_num_reqs, cdiv(max_model_len, block_size),
                        max_num_batched_tokens, pin_memory, device)
+            for block_size in block_sizes
         ]
 
     def append_row(self, block_ids: list[list[int]], row_idx: int) -> None:
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index bb986b604..34737029f 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -56,14 +56,14 @@ class CachedRequestState:
 class InputBatch:
 
     def __init__(
-        self,
-        max_num_reqs: int,
-        max_model_len: int,
-        max_num_batched_tokens: int,
-        device: torch.device,
-        pin_memory: bool,
-        vocab_size: int,
-        block_size: int,
+            self,
+            max_num_reqs: int,
+            max_model_len: int,
+            max_num_batched_tokens: int,
+            device: torch.device,
+            pin_memory: bool,
+            vocab_size: int,
+            block_sizes: list[int],  # The block_size of each kv cache group
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
@@ -105,7 +105,7 @@ class InputBatch:
             max_num_batched_tokens=max_num_batched_tokens,
             pin_memory=pin_memory,
             device=device,
-            block_size=block_size,
+            block_sizes=block_sizes,
         )
 
         # Sampling-related.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b7448be26..6a566a602 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -143,7 +143,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         self.attn_metadata_builders: list[AttentionMetadataBuilder] = []
         self.attn_backends: list[type[AttentionBackend]] = []
         # self.kv_cache_config: KVCacheConfig
-        # self.input_batch: InputBatch # Persistent batch.
 
         # req_id -> (input_id -> encoder_output)
         self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
@@ -173,6 +172,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
 
+        # Input Batch
+        # NOTE(Chen): Ideally, we should initialize the input batch inside
+        # `initialize_kv_cache` based on the kv cache config. However, as in
+        # https://github.com/vllm-project/vllm/pull/18298, due to some unknown
+        # reasons, we have to initialize the input batch before `load_model`,
+        # quantization + weight offloading will fail otherwise. As a temporary
+        # solution, we initialize the input batch here, and re-initialize it
+        # in `initialize_kv_cache` if the block_sizes here is different from
+        # the block_sizes in the kv cache config.
         self.input_batch = InputBatch(
             max_num_reqs=self.max_num_reqs,
             max_model_len=self.max_model_len,
@@ -180,7 +188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
-            block_size=self.cache_config.block_size,
+            block_sizes=[self.cache_config.block_size],
         )
 
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
@@ -2040,6 +2048,35 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             self.attn_backends.append(attn_backend_i)
             self.attn_metadata_builders.append(attn_metadata_builder_i)
 
+    def may_reinitialize_input_batch(self,
+                                     kv_cache_config: KVCacheConfig) -> None:
+        """
+        Re-initialize the input batch if the block sizes are different from
+        `[self.cache_config.block_size]`. This usually happens when there
+        are multiple KV cache groups.
+
+        Args:
+            kv_cache_config: The KV cache configuration.
+        """
+        block_sizes = [
+            kv_cache_group.kv_cache_spec.block_size
+            for kv_cache_group in kv_cache_config.kv_cache_groups
+        ]
+        if block_sizes != [self.cache_config.block_size]:
+            assert self.cache_config.cpu_offload_gb == 0, (
+                "Cannot re-initialize the input batch when CPU weight "
+                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                "for more details.")
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.max_model_len,
+                max_num_batched_tokens=self.max_num_tokens,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                vocab_size=self.model_config.get_vocab_size(),
+                block_sizes=block_sizes,
+            )
+
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Initialize KV cache based on `kv_cache_config`.
@@ -2047,11 +2084,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             kv_cache_config: Configuration for the KV cache, including the KV
             cache size of each layer
         """
-        if len(kv_cache_config.kv_cache_groups) > 1:
-            raise NotImplementedError(
-                "Hybrid models with more than one KV cache type are not "
-                "supported yet.")
         self.kv_cache_config = kv_cache_config
+        self.may_reinitialize_input_batch(kv_cache_config)
         self.initialize_attn_backend(kv_cache_config)
 
         kv_caches: dict[str, torch.Tensor] = {}
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index f15234f49..73c445d14 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -200,7 +200,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
-            block_size=self.block_size,
+            block_sizes=[self.block_size],
         )
 
         # Cached torch/numpy tensor
@@ -1358,8 +1358,9 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 device=self.device,
                 pin_memory=self.pin_memory,
                 vocab_size=self.model_config.get_vocab_size(),
-                block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.
-                block_size,
+                block_sizes=[
+                    kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
+                ],
             )
         # Verify dtype compatibility between block_table_cpu and input_batch
         assert self.block_table_cpu.dtype == self.input_batch.block_table[
-- 
GitLab


From 135cf55cd1d83cd4e18266e343a59e6d9f87856f Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Tue, 3 Jun 2025 18:26:33 -0400
Subject: [PATCH 148/274] [V1][Spec Decode][Ngram] 1.35x gain -> 1.95x gain on
 InstructCoder with prompt fix (#18971)

---
 benchmarks/benchmark_dataset.py | 10 +++++++++-
 vllm/benchmarks/datasets.py     | 14 +++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 80a9246aa..5d2a26cd4 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -865,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset):
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+            the code, do not include any explanation."
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 0ef3e0254..f795a1256 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -880,7 +880,19 @@ class InstructCoderDataset(HuggingFaceDataset):
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+            the code, do not include any explanation."
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{
+                    "role": "user",
+                    "content": prompt
+                }],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
-- 
GitLab


From b5fd9506c14bed640210a7f3d0adb03a024afdbe Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Wed, 4 Jun 2025 06:30:55 +0800
Subject: [PATCH 149/274] [Bugfix] get_num_blocks_to_allocate with null_block
 (#19031)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/v1/core/test_specialized_manager.py    | 23 ++++++++++++++++++++
 vllm/v1/core/block_pool.py                   |  5 +++--
 vllm/v1/core/kv_cache_utils.py               |  3 +++
 vllm/v1/core/single_type_kv_cache_manager.py |  5 +++--
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py
index c6f7481dd..92ce8ea8b 100644
--- a/tests/v1/core/test_specialized_manager.py
+++ b/tests/v1/core/test_specialized_manager.py
@@ -144,3 +144,26 @@ def test_sliding_window_remove_skipped_blocks():
     # of removed blocks should be [1003, 1002].
     manager.remove_skipped_blocks("test", 11)
     assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
+
+
+def test_get_num_blocks_to_allocate():
+    block_size = 2
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=4,  # Placeholder value, not related to test result
+        use_mla=False,
+    )
+
+    block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
+    manager = get_sliding_window_manager(sliding_window_spec, block_pool)
+    cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
+    cached_blocks_2 = [block_pool.null_block for _ in range(5)
+                       ] + [KVCacheBlock(i + 1) for i in range(5)]
+
+    assert manager.get_num_blocks_to_allocate("1", 20 * block_size,
+                                              cached_blocks_1) == 20
+    assert manager.get_num_blocks_to_allocate("2", 20 * block_size,
+                                              cached_blocks_2) == 15
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index 27eaca497..5118e4d8e 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -63,6 +63,7 @@ class BlockPool:
         # The ref_cnt of null_block is not maintained, needs special care to
         # avoid freeing it.
         self.null_block = self.free_block_queue.popleft()
+        self.null_block.is_null = True
 
         self.enable_kv_cache_events = enable_kv_cache_events
         self.kv_event_queue: list[KVCacheEvent] = []
@@ -252,7 +253,7 @@ class BlockPool:
         for block in blocks:
             # ref_cnt=0 means this block is in the free list (i.e. eviction
             # candidate), so remove it.
-            if block.ref_cnt == 0 and block != self.null_block:
+            if block.ref_cnt == 0 and not block.is_null:
                 self.free_block_queue.remove(block)
             block.incr_ref()
 
@@ -267,7 +268,7 @@ class BlockPool:
         for block in ordered_blocks:
             block.decr_ref()
             # null_block should not be added to the free list.
-            if block.ref_cnt == 0 and block != self.null_block:
+            if block.ref_cnt == 0 and not block.is_null:
                 self.free_block_queue.append(block)
 
     def reset_prefix_cache(self) -> bool:
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 61476362e..3b5a37926 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -125,6 +125,9 @@ class KVCacheBlock:
     prev_free_block: Optional["KVCacheBlock"] = None
     next_free_block: Optional["KVCacheBlock"] = None
 
+    # Whether the block is a null block that should never be cached.
+    is_null: bool = False
+
     def incr_ref(self):
         self.ref_cnt += 1
 
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 233c73e88..a529cde09 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -83,8 +83,9 @@ class SingleTypeKVCacheManager(ABC):
         # free queue and ref_cnt == 0), it will be changed from a free block
         # to a computed block when the request is allocated, so we also count
         # it as needed to be allocated.
-        num_evictable_computed_blocks = sum(blk.ref_cnt == 0
-                                            for blk in new_computed_blocks)
+        num_evictable_computed_blocks = sum(
+            blk.ref_cnt == 0 and not blk.is_null
+            for blk in new_computed_blocks)
         return ((num_new_blocks + num_evictable_computed_blocks) *
                 self.num_kv_cache_groups)
 
-- 
GitLab


From 4de790fcad85abb0969da18bc9125889407c432a Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Wed, 4 Jun 2025 07:27:24 +0800
Subject: [PATCH 150/274] [Bugfix]: Fix the incompatibility issue with
 tool_choice 'required' when Thinking is enabled (#19075)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 .../test_completion_with_function_calling.py   |  2 +-
 vllm/entrypoints/openai/serving_chat.py        | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index dbea2dc0b..5c1f07832 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -9,7 +9,7 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 7e514d660..777b7f5bc 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -320,10 +320,13 @@ class OpenAIServingChat(OpenAIServing):
     def extract_tool_call_required_streaming(
         self,
         previous_text: str,
-        current_text: str,
+        current_text: Optional[str],
         delta_text: str,
         function_name_returned: bool,
     ) -> tuple[Optional[DeltaMessage], bool]:
+        if current_text is None or current_text == "":
+            # if the current text is empty, we cannot parse it
+            return None, function_name_returned
         try:
             obj = partial_json_parser.loads(current_text)
         except partial_json_parser.core.exceptions.MalformedJSON:
@@ -650,10 +653,18 @@ class OpenAIServingChat(OpenAIServing):
                         current_text = previous_text + delta_text
                         fn_name_returned = function_name_returned[i]
 
+                        if self.reasoning_parser:
+                            _, content = \
+                                reasoning_parser.extract_reasoning_content(
+                                    current_text,
+                                    request
+                                )
+                        else:
+                            content = current_text
                         delta_message, function_name_returned[i] = (
                             self.extract_tool_call_required_streaming(
                                 previous_text=previous_text,
-                                current_text=current_text,
+                                current_text=content,
                                 delta_text=delta_text,
                                 function_name_returned=fn_name_returned))
 
@@ -981,8 +992,9 @@ class OpenAIServingChat(OpenAIServing):
 
                 # the fields of FunctionDefinition are a superset of the
                 # tool call outputs and can be used for parsing
+                assert content is not None
                 tool_calls = TypeAdapter(
-                    list[FunctionDefinition]).validate_json(output.text)
+                    list[FunctionDefinition]).validate_json(content)
                 message = ChatMessage(
                     role=role,
                     content="",
-- 
GitLab


From 5d96533e2235c37e64ef381fafa244db197b25dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Wed, 4 Jun 2025 01:53:16 +0200
Subject: [PATCH 151/274] [Bugfix][P/D] Fix Prefix Cache Bug (#18411)

Signed-off-by: nicklucche <nlucches@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
---
 vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 3f0b0e295..fd2228012 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -739,7 +739,8 @@ class NixlConnectorWorker:
         # just notify P worker that we have the blocks we need.
         num_local_blocks = len(local_block_ids)
         if num_local_blocks == 0:
-            self.nixl_wrapper.send_notif(dst_engine_id,
+            agent_name = self._remote_agents[dst_engine_id]
+            self.nixl_wrapper.send_notif(agent_name,
                                          notif_msg=request_id.encode("utf-8"))
             return
 
-- 
GitLab


From a8da78eac92b5e79947a6fdd51bec0d1e5cea0a7 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Wed, 4 Jun 2025 08:14:06 +0800
Subject: [PATCH 152/274] [Bugfix] Max concurrency estimation and
 check_enough_kv_cache_memory for models with sliding window layers (#19029)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/v1/core/test_kv_cache_utils.py | 90 +++++++++++++++++++++++++---
 vllm/v1/core/kv_cache_utils.py       | 61 +++++++++++++------
 2 files changed, 125 insertions(+), 26 deletions(-)

diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index ad34becb1..71ea43383 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -12,13 +12,11 @@ from vllm.utils import GiB_bytes, sha256
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 # disable yapf here as it formats differently than isort such that both fail
 # yapf: disable
-from vllm.v1.core.kv_cache_utils import (FreeKVCacheBlockQueue, KVCacheBlock,
-                                         PrefixCachingMetrics,
-                                         estimate_max_model_len,
-                                         generate_block_hash_extra_keys,
-                                         hash_block_tokens,
-                                         hash_request_tokens,
-                                         unify_kv_cache_configs)
+from vllm.v1.core.kv_cache_utils import (
+    FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
+    estimate_max_model_len, generate_block_hash_extra_keys,
+    get_max_concurrency_for_kv_cache_config, hash_block_tokens,
+    hash_request_tokens, unify_kv_cache_configs)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheTensor,
                                         SlidingWindowSpec)
@@ -597,6 +595,84 @@ def test_estimate_max_model_len(model_id, max_model_len,
     assert estimated_max_len == want_estimated_max_len
 
 
+def test_get_max_concurrency_for_kv_cache_config():
+    # Create a VllmConfig
+    model_id = "Qwen/Qwen1.5-7B"
+    max_model_len = 16384
+    model_config = ModelConfig(
+        model_id,
+        task="generate",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        max_model_len=max_model_len,
+    )
+    scheduler_config = SchedulerConfig(max_num_batched_tokens=1024,
+                                       enable_chunked_prefill=True)
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        scheduler_config=scheduler_config,
+    )
+
+    full_attention_spec = FullAttentionSpec(
+        block_size=16,
+        num_kv_heads=32,
+        head_size=128,
+        dtype=torch.float16,
+        use_mla=False,
+    )
+
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=16,
+        num_kv_heads=32,
+        head_size=128,
+        dtype=torch.float16,
+        use_mla=False,
+        sliding_window=1024,
+    )
+
+    kv_cache_config_full_attention = KVCacheConfig(
+        num_blocks=int(1024 * 1.5),
+        tensors={},
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
+                             full_attention_spec),
+        ],
+    )
+    max_concurrency_full_attention = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config_full_attention)
+    assert max_concurrency_full_attention == 1.5
+
+    kv_cache_config_sliding_window = KVCacheConfig(
+        num_blocks=129 * 3,
+        tensors={},
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
+                             sliding_window_spec),
+        ],
+    )
+    max_concurrency_sliding_window = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config_sliding_window)
+    assert max_concurrency_sliding_window == 3
+
+    kv_cache_config_hybrid_model = KVCacheConfig(
+        num_blocks=(1024 + 129) * 3,
+        tensors={},
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
+                             full_attention_spec),
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32, 64)],
+                             sliding_window_spec),
+        ],
+    )
+    max_concurrency_hybrid_model = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config_hybrid_model)
+    assert max_concurrency_hybrid_model == 3
+
+
 def test_allocate_with_lookahead():
     """Verify that lookahead tokens correctly affect block allocation"""
     block_size = 4
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 3b5a37926..ad3c21f79 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -3,13 +3,13 @@
 """KV-Cache Utilities."""
 import os
 from collections import deque
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from typing import Any, Callable, NamedTuple, Optional
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, sha256
+from vllm.utils import GiB_bytes, cdiv, sha256
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
                                         KVCacheTensor, SlidingWindowSpec)
@@ -468,6 +468,15 @@ def hash_request_tokens(hash_function: Any, block_size: int,
     return ret
 
 
+def max_memory_usage_bytes(vllm_config: VllmConfig,
+                           kv_cache_specs: Iterable[KVCacheSpec]) -> int:
+    """
+    Get the maximum memory usage in bytes for the given KV cache specs.
+    """
+    return sum(
+        spec.max_memory_usage_bytes(vllm_config) for spec in kv_cache_specs)
+
+
 def estimate_max_model_len(vllm_config: VllmConfig,
                            kv_cache_spec: dict[str, KVCacheSpec],
                            available_memory: int) -> int:
@@ -489,11 +498,8 @@ def estimate_max_model_len(vllm_config: VllmConfig,
         # Modify the max_model_len for this calculation
         vllm_config.model_config.max_model_len = model_len
         # Calculate memory needed for the given model length
-        memory_needed = sum(
-            (layer_spec.max_memory_usage_bytes(vllm_config)
-             for layer_spec in kv_cache_spec.values()),
-            start=0,
-        )
+        memory_needed = max_memory_usage_bytes(vllm_config,
+                                               kv_cache_spec.values())
         return memory_needed <= available_memory
 
     # Binary search for the maximum model length
@@ -538,9 +544,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
                          "initializing the engine.")
 
     max_model_len = vllm_config.model_config.max_model_len
-    needed_memory = 0
-    for layer_spec in kv_cache_spec.values():
-        needed_memory += layer_spec.max_memory_usage_bytes(vllm_config)
+    needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
 
     if needed_memory > available_memory:
         # Estimate the maximum model length that can fit in the available memory
@@ -606,6 +610,24 @@ def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
     return len(layer_keys) == 1
 
 
+def get_max_concurrency_for_kv_cache_config(
+        vllm_config: VllmConfig, kv_cache_config: KVCacheConfig) -> float:
+    """
+    Get the maximum concurrency for the given KV cache configuration.
+    """
+    num_layer_per_group = max(
+        len(group.layer_names) for group in kv_cache_config.kv_cache_groups)
+    max_memory_usage_per_request = num_layer_per_group * max_memory_usage_bytes(
+        vllm_config,
+        (group.kv_cache_spec for group in kv_cache_config.kv_cache_groups))
+    memory_per_block = kv_cache_config.kv_cache_groups[
+        0].kv_cache_spec.page_size_bytes * num_layer_per_group
+    num_block_per_request = cdiv(max_memory_usage_per_request,
+                                 memory_per_block)
+    max_concurrency = kv_cache_config.num_blocks / num_block_per_request
+    return max_concurrency
+
+
 def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
                                       kv_cache_spec: dict[str, KVCacheSpec],
                                       available_memory: int) -> KVCacheConfig:
@@ -637,14 +659,6 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
             "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
         num_blocks = num_gpu_blocks_override
 
-    num_tokens = num_blocks * vllm_config.cache_config.block_size
-    num_tokens_str = f"{num_tokens:,}"
-    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
-    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
-    max_concurrency = num_tokens / vllm_config.model_config.max_model_len
-    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
-                max_model_len_str, max_concurrency)
-
     per_layer_size = page_size * num_blocks
     # All layers have the same KV cache spec, so we create one kv cache group
     # for all layers.
@@ -659,6 +673,15 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
         kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
                                                     grouped_layer_names),
     )
+
+    num_tokens = num_blocks * vllm_config.cache_config.block_size
+    num_tokens_str = f"{num_tokens:,}"
+    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
+    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_concurrency = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config)
+    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                max_model_len_str, max_concurrency)
     return kv_cache_config
 
 
@@ -705,8 +728,8 @@ def get_kv_cache_config(vllm_config: VllmConfig,
     Returns:
         The generated KVCacheConfigs
     """
-    check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
     unify_hybrid_kv_cache_specs(kv_cache_spec)
+    check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
     if is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
-- 
GitLab


From b712be98c790794479030313f2c2b9dae17ea7de Mon Sep 17 00:00:00 2001
From: Yan Ru Pei <yanrpei@gmail.com>
Date: Tue, 3 Jun 2025 17:14:20 -0700
Subject: [PATCH 153/274] feat: add data parallel rank to KVEventBatch (#18925)

---
 .buildkite/test-pipeline.yaml              |   2 +
 tests/distributed/conftest.py              | 107 +++++++-----
 tests/distributed/test_events.py           |  69 +++++++-
 tests/v1/engine/test_engine_core_client.py | 189 +++++++++++++++++----
 vllm/distributed/kv_events.py              |  77 ++++++++-
 vllm/v1/core/sched/scheduler.py            |   4 +-
 6 files changed, 362 insertions(+), 86 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5fb8ceaac..8ab96b3b7 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -145,6 +145,7 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -154,6 +155,7 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
index 95f085788..666a715cc 100644
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -13,11 +13,13 @@ from vllm.distributed.kv_events import EventPublisherFactory
 
 from .test_events import SampleBatch
 
+DP_RANK = 0
+
 
 @pytest.fixture
 def random_port():
     """Generate a random port number for testing"""
-    return random.randint(10000, 60000)
+    return random.randint(10000, 59900)
 
 
 @pytest.fixture
@@ -30,21 +32,23 @@ def publisher_config(random_port, request):
         replay_endpoint = endpoint + "-replay"
     else:
         endpoint = f"tcp://*:{random_port}"
-        replay_endpoint = f"tcp://*:{random_port + 1}"
+        replay_endpoint = f"tcp://*:{random_port + 100}"
 
-    return KVEventsConfig(enable_kv_cache_events=True,
-                          publisher="zmq",
-                          endpoint=endpoint,
-                          replay_endpoint=replay_endpoint,
-                          buffer_steps=100,
-                          hwm=1000,
-                          topic="test")
+    return KVEventsConfig(
+        enable_kv_cache_events=True,
+        publisher="zmq",
+        endpoint=endpoint,
+        replay_endpoint=replay_endpoint,
+        buffer_steps=100,
+        hwm=1000,
+        topic="test",
+    )
 
 
 @pytest.fixture
 def publisher(publisher_config):
     """Create and return a publisher instance"""
-    pub = EventPublisherFactory.create(publisher_config)
+    pub = EventPublisherFactory.create(publisher_config, DP_RANK)
     yield pub
     pub.shutdown()
 
@@ -60,7 +64,11 @@ def subscriber(publisher_config):
     if replay_endpoint and replay_endpoint.startswith("tcp://*"):
         replay_endpoint = replay_endpoint.replace("*", "127.0.0.1")
 
-    sub = MockSubscriber(endpoint, replay_endpoint, publisher_config.topic)
+    sub = MockSubscriber(
+        [endpoint],
+        [replay_endpoint] if replay_endpoint else None,
+        publisher_config.topic,
+    )
     yield sub
     sub.close()
 
@@ -68,26 +76,37 @@ def subscriber(publisher_config):
 class MockSubscriber:
     """Helper class to receive and verify published events"""
 
-    def __init__(self,
-                 pub_endpoint: str,
-                 replay_endpoint: Optional[str] = None,
-                 topic: str = "",
-                 decode_type=SampleBatch):
+    def __init__(
+        self,
+        pub_endpoints: Union[str, list[str]],
+        replay_endpoints: Optional[Union[str, list[str]]] = None,
+        topic: str = "",
+        decode_type=SampleBatch,
+    ):
         self.ctx = zmq.Context.instance()
 
-        # Set up subscriber socket
-        self.sub = self.ctx.socket(zmq.SUB)
-        self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode('utf-8'))
-        self.sub.connect(pub_endpoint)
+        # Convert single endpoint to list for consistency
+        if isinstance(pub_endpoints, str):
+            pub_endpoints = [pub_endpoints]
+        if isinstance(replay_endpoints, str):
+            replay_endpoints = [replay_endpoints]
 
-        # Set up replay socket if provided
-        self.replay = None
-        if replay_endpoint:
-            self.replay = self.ctx.socket(zmq.REQ)
-            self.replay.connect(replay_endpoint)
+        # Set up subscriber socket - connect to all endpoints
+        self.sub = self.ctx.socket(zmq.SUB)
+        self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode("utf-8"))
+        for endpoint in pub_endpoints:
+            self.sub.connect(endpoint)
+
+        # Set up replay sockets if provided
+        self.replay_sockets = []
+        if replay_endpoints:
+            for replay_endpoint in replay_endpoints:
+                replay = self.ctx.socket(zmq.REQ)
+                replay.connect(replay_endpoint)
+                self.replay_sockets.append(replay)
 
         self.topic = topic
-        self.topic_bytes = topic.encode('utf-8')
+        self.topic_bytes = topic.encode("utf-8")
         self.received_msgs: list[tuple[int, SampleBatch]] = []
         self.last_seq = -1
         self.decoder = msgspec.msgpack.Decoder(type=decode_type)
@@ -107,25 +126,31 @@ class MockSubscriber:
         self.received_msgs.append((seq, data))
         return seq, data
 
-    def request_replay(self, start_seq: int) -> None:
+    def request_replay(self, start_seq: int, socket_idx: int = 0) -> None:
         """Request replay of messages starting from start_seq"""
-        if not self.replay:
-            raise ValueError("Replay socket not initialized")
-
-        self.replay.send(start_seq.to_bytes(8, "big"))
-
-    def receive_replay(self) -> list[tuple[int, SampleBatch]]:
-        """Receive replayed messages"""
-        if not self.replay:
-            raise ValueError("Replay socket not initialized")
-
+        if not self.replay_sockets:
+            raise ValueError("Replay sockets not initialized")
+        if socket_idx >= len(self.replay_sockets):
+            raise ValueError(f"Invalid socket index {socket_idx}")
+
+        self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big"))
+
+    def receive_replay(self,
+                       socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
+        """Receive replayed messages from a specific replay socket"""
+        if not self.replay_sockets:
+            raise ValueError("Replay sockets not initialized")
+        if socket_idx >= len(self.replay_sockets):
+            raise ValueError(f"Invalid socket index {socket_idx}")
+
+        replay_socket = self.replay_sockets[socket_idx]
         replayed: list[tuple[int, SampleBatch]] = []
         while True:
             try:
-                if not self.replay.poll(1000):
+                if not replay_socket.poll(1000):
                     break
 
-                frames = self.replay.recv_multipart()
+                frames = replay_socket.recv_multipart()
                 if not frames or not frames[-1]:
                     # End of replay marker
                     break
@@ -142,5 +167,5 @@ class MockSubscriber:
     def close(self):
         """Clean up resources"""
         self.sub.close()
-        if self.replay:
-            self.replay.close()
+        for replay in self.replay_sockets:
+            replay.close()
diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py
index ec1e5a2d6..8be9ee0a1 100644
--- a/tests/distributed/test_events.py
+++ b/tests/distributed/test_events.py
@@ -9,6 +9,8 @@ import pytest
 from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory,
                                         NullEventPublisher)
 
+DP_RANK = 0
+
 
 class EventSample(
         msgspec.Struct,
@@ -121,7 +123,7 @@ def test_topic_filtering(publisher_config):
     publisher_config.replay_endpoint = None
 
     publisher_config.topic = "foo"
-    pub = EventPublisherFactory.create(publisher_config)
+    pub = EventPublisherFactory.create(publisher_config, DP_RANK)
 
     from .conftest import MockSubscriber
     sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
@@ -185,9 +187,72 @@ def test_high_volume(publisher, subscriber):
 
 def test_null_publisher():
     """Test that NullEventPublisher can be used without errors"""
-    publisher = NullEventPublisher()
+    publisher = NullEventPublisher(DP_RANK)
 
     # This should not raise any errors
     batch = create_test_events(5)
     publisher.publish(batch)
     publisher.shutdown()
+
+
+def test_data_parallel_rank_tagging(publisher_config):
+    """Test that events are properly tagged with their data parallel rank"""
+
+    publisher_config.topic = "foo"
+    pub_0 = EventPublisherFactory.create(publisher_config, DP_RANK)
+    pub_1 = EventPublisherFactory.create(publisher_config, DP_RANK + 1)
+
+    # Hardcode the expected endpoints based on port offsetting behavior
+    # Both ranks get offsets according to _offset_endpoint_port function
+    base_endpoint = publisher_config.endpoint
+    if "tcp://" in base_endpoint:
+        # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558
+        expected_endpoint_0 = base_endpoint  # rank 0 gets port + 0 = same port
+        expected_endpoint_1 = base_endpoint.replace(
+            ":5557", ":5558")  # rank 1 gets port + 1
+    else:
+        # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1
+        expected_endpoint_0 = base_endpoint  # rank 0 gets base
+        expected_endpoint_1 = base_endpoint + "_dp1"  # rank 1 gets _dp1
+
+    from .conftest import MockSubscriber
+    sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic)
+    sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic)
+
+    try:
+        time.sleep(0.1)  # Let publishers start up
+
+        # Publish events from different ranks
+        batch_0 = create_test_events(2)
+        batch_1 = create_test_events(3)
+
+        pub_0.publish(batch_0)
+        pub_1.publish(batch_1)
+
+        # Receive events from rank 0
+        result_0 = sub_0.receive_one(timeout=200)
+        assert result_0 is not None, "No message received from rank 0"
+        seq_0, received_0 = result_0
+
+        # Receive events from rank 1
+        result_1 = sub_1.receive_one(timeout=200)
+        assert result_1 is not None, "No message received from rank 1"
+        seq_1, received_1 = result_1
+
+        # Verify DP rank tagging
+        assert received_0.data_parallel_rank == 0, (
+            f"Expected DP rank 0, got {received_0.data_parallel_rank}")
+        assert received_1.data_parallel_rank == 1, (
+            f"Expected DP rank 1, got {received_1.data_parallel_rank}")
+
+        # Verify event content is correct
+        assert len(
+            received_0.events) == 2, "Wrong number of events from rank 0"
+        assert len(
+            received_1.events) == 3, "Wrong number of events from rank 1"
+
+    finally:
+        pub_0.shutdown()
+        pub_1.shutdown()
+        sub_0.close()
+        sub_1.close()
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index a01b205df..47181d36f 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -12,8 +12,10 @@ from typing import Optional
 import pytest
 from transformers import AutoTokenizer
 
+from tests.utils import multi_gpu_test
 from vllm import SamplingParams
-from vllm.distributed.kv_events import BlockStored, KVEventBatch
+from vllm.distributed.kv_events import (BlockStored, KVEventBatch,
+                                        ZmqEventPublisher)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
@@ -37,10 +39,15 @@ PROMPT = "Hello my name is Robert and I love quantization kernels"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
 
-def make_request(params: SamplingParams) -> EngineCoreRequest:
+def make_request(
+        params: SamplingParams,
+        prompt_tokens_ids: Optional[list[int]] = None) -> EngineCoreRequest:
+    if not prompt_tokens_ids:
+        prompt_tokens_ids = PROMPT_TOKENS
+
     return EngineCoreRequest(
         request_id=str(uuid.uuid4()),
-        prompt_token_ids=PROMPT_TOKENS,
+        prompt_token_ids=prompt_tokens_ids,
         mm_inputs=None,
         mm_hashes=None,
         mm_placeholders=None,
@@ -88,6 +95,25 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: dict):
             break
 
 
+async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict):
+
+    while True:
+        engine_core_outputs = (await client.get_output_async()).outputs
+
+        if len(engine_core_outputs) == 0:
+            continue
+
+        # Add outputs to the dict
+        for out in engine_core_outputs:
+            outputs[out.request_id].append(out)
+
+        # Check if all request IDs in outputs have finished
+        if all(outs and outs[-1].finished for outs in outputs.values()):
+            break
+
+        await asyncio.sleep(0.1)
+
+
 # Dummy utility function to monkey-patch into engine core.
 def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
     print(f"echo util function called: {msg}, {err_msg}")
@@ -273,10 +299,12 @@ def test_kv_cache_events(
         block_size = 16
         num_blocks = 2
 
-        engine_args = EngineArgs(model=MODEL_NAME,
-                                 enforce_eager=True,
-                                 enable_prefix_caching=True,
-                                 block_size=block_size)
+        engine_args = EngineArgs(
+            model=MODEL_NAME,
+            enforce_eager=True,
+            enable_prefix_caching=True,
+            block_size=block_size,
+        )
         engine_args.kv_events_config = publisher_config
 
         vllm_config = engine_args.create_engine_config(
@@ -297,19 +325,8 @@ def test_kv_cache_events(
 
         try:
             custom_tokens = list(range(num_blocks * block_size))
-            request = EngineCoreRequest(
-                request_id=str(uuid.uuid4()),
-                prompt_token_ids=custom_tokens,
-                mm_inputs=None,
-                mm_hashes=None,
-                mm_placeholders=None,
-                sampling_params=SamplingParams(
-                    max_tokens=1),  # Short completion for speed
-                eos_token_id=None,
-                arrival_time=time.time(),
-                lora_request=None,
-                cache_salt=None,
-            )
+            sampling_params = SamplingParams(max_tokens=1)
+            request = make_request(sampling_params, custom_tokens)
             client.add_request(request)
 
             outputs: dict[str, list] = {request.request_id: []}
@@ -321,24 +338,130 @@ def test_kv_cache_events(
             seq, received = result
 
             assert seq == 0, "Sequence number mismatch"
-            assert len(received.events) == 1, (
-                "We should have exactly one BlockStored event")
+            assert (len(received.events) == 1
+                    ), "We should have exactly one BlockStored event"
             event = received.events[0]
             assert isinstance(
-                event, BlockStored), ("We should have a BlockStored event")
-            assert len(event.block_hashes) == num_blocks, (
-                "We should have a BlockStored event with 2 block_hashes")
-            assert event.block_size == block_size, (
-                "Block size should be the same as the block size")
-            assert event.parent_block_hash is None, (
-                "Parent block hash should be None")
+                event, BlockStored), "We should have a BlockStored event"
+            assert (len(event.block_hashes) == num_blocks
+                    ), "We should have a BlockStored event with 2 block_hashes"
+            assert (event.block_size == block_size
+                    ), "Block size should be the same as the block size"
+            assert (event.parent_block_hash
+                    is None), "Parent block hash should be None"
             assert event.lora_id is None, "Lora id should be None"
-            assert len(event.token_ids) == num_blocks * block_size, (
-                "Token ids should be the same as the custom tokens")
-            assert event.token_ids == custom_tokens, (
-                "Token ids should be the same as the custom tokens")
+            assert (len(event.token_ids) == num_blocks * block_size
+                    ), "Token ids should be the same as the custom tokens"
+            assert (event.token_ids == custom_tokens
+                    ), "Token ids should be the same as the custom tokens"
+        finally:
+            client.shutdown()
+            subscriber.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "multiprocessing_mode,publisher_config",
+    [(True, "tcp")],
+    indirect=["publisher_config"],
+)
+@multi_gpu_test(num_gpus=4)
+async def test_kv_cache_events_dp(
+    monkeypatch: pytest.MonkeyPatch,
+    multiprocessing_mode: bool,
+    publisher_config,
+):
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        block_size = 16
+        num_blocks = 2
+        dp_size = 2
+        tp_size = 2
+
+        engine_args = EngineArgs(
+            model=MODEL_NAME,
+            enforce_eager=True,
+            enable_prefix_caching=True,
+            data_parallel_size=dp_size,
+            tensor_parallel_size=tp_size,
+            block_size=block_size,
+        )
+        engine_args.kv_events_config = publisher_config
+
+        vllm_config = engine_args.create_engine_config(
+            UsageContext.UNKNOWN_CONTEXT)
+
+        executor_class = Executor.get_class(vllm_config)
+        client = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocessing_mode,
+            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
+        )
+        await asyncio.sleep(1)
+
+        # Build endpoints for all DP ranks
+        base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+        endpoints = []
+        for i in range(dp_size):
+            offset_endpoint = ZmqEventPublisher.offset_endpoint_port(
+                base_endpoint, i)
+            endpoints.append(offset_endpoint)
+
+        subscriber = MockSubscriber(endpoints,
+                                    topic=publisher_config.topic,
+                                    decode_type=KVEventBatch)
+
+        try:
+            custom_tokens = list(range(num_blocks * block_size))
+            sampling_params = SamplingParams(max_tokens=1)
+            all_request_ids = []
+
+            # Create and add 25 requests
+            # NOTE: attempts to force routing to both dp groups but can be flaky
+            for i in range(25):
+                await asyncio.sleep(0.01)
+                request = make_request(sampling_params, custom_tokens)
+                await client.add_request_async(request)
+                all_request_ids.append(request.request_id)
+
+            await asyncio.sleep(0.1)
+
+            # Initialize outputs dict for all requests
+            outputs: dict[str, list] = {
+                req_id: []
+                for req_id in all_request_ids
+            }
+
+            print("processing requests...")
+            await asyncio.wait_for(loop_until_fully_done_async(
+                client, outputs),
+                                   timeout=20.0)
+
+            # Receive from subscriber until no more messages
+            print("collecting results...")
+            results = []
+            while True:
+                result = subscriber.receive_one(timeout=1)
+                print(result)
+                if result is None:
+                    break
+                results.append(result)
+
+            # Collect all events and data_parallel_ranks from all results
+            all_dp_ranks = [
+                received.data_parallel_rank for (_, received) in results
+            ]
+            unique_dps = set(all_dp_ranks)
+            assert (
+                len(unique_dps) == 2
+            ), f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
+
         finally:
             client.shutdown()
+            subscriber.close()
 
 
 @pytest.mark.timeout(20)
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index 9bf1c058a..2d7935773 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -28,6 +28,7 @@ class EventBatch(
 ):
     ts: float
     events: list[Any]
+    data_parallel_rank: Optional[int] = None
 
 
 class KVCacheEvent(
@@ -60,7 +61,22 @@ class KVEventBatch(EventBatch):
 
 
 class EventPublisher(ABC):
-    """Lightweight publisher for EventBatch batches."""
+    """Lightweight publisher for EventBatch batches with data parallelism
+    support.
+    
+    In data parallel setups, each DP rank runs its own EventPublisher instance
+    to avoid duplicate events and ensure proper event attribution:
+    
+    - Each DP rank creates a separate publisher
+    - Publishers automatically annotate events with their data_parallel_rank
+    - This allows consumers to distinguish events from different DP ranks
+    
+    The publisher is responsible for adding DP metadata since the scheduler
+    operates independently of DP topology and shouldn't need DP awareness.
+    """
+
+    def __init__(self, data_parallel_rank: int = 0) -> None:
+        self._data_parallel_rank = data_parallel_rank
 
     @abstractmethod
     def publish(self, events: EventBatch) -> None:
@@ -113,6 +129,7 @@ class ZmqEventPublisher(EventPublisher):
 
     def __init__(
         self,
+        data_parallel_rank: int,
         endpoint: str = "tcp://*:5557",
         replay_endpoint: Optional[str] = None,
         buffer_steps: int = 10_000,
@@ -121,6 +138,7 @@ class ZmqEventPublisher(EventPublisher):
         topic: str = "",
     ) -> None:
         # Storage
+        super().__init__(data_parallel_rank)
         self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size)
         self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps)
 
@@ -128,8 +146,11 @@ class ZmqEventPublisher(EventPublisher):
         self._ctx = zmq.Context.instance()
         self._pub: Optional[zmq.Socket] = None
         self._replay: Optional[zmq.Socket] = None
-        self._endpoint = endpoint
-        self._replay_endpoint = replay_endpoint
+        self._dp_rank = data_parallel_rank
+
+        self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank)
+        self._replay_endpoint = self.offset_endpoint_port(
+            replay_endpoint, self._dp_rank)
         self._hwm = hwm
         self._socket_setup()
 
@@ -149,6 +170,8 @@ class ZmqEventPublisher(EventPublisher):
     def publish(self, events: EventBatch) -> None:
         if not self._running:
             raise RuntimeError("Publisher is closed")
+        if events.data_parallel_rank is None:
+            events.data_parallel_rank = self._data_parallel_rank
         self._event_queue.put(events)
 
     def shutdown(self) -> None:
@@ -191,11 +214,12 @@ class ZmqEventPublisher(EventPublisher):
             self._pub.set_hwm(self._hwm)
             # Heuristic: bind if wildcard / * present, else connect.
             # bind stable, connect volatile convention
-            if ("*" in self._endpoint or "::" in self._endpoint
-                    or self._endpoint.startswith("ipc://")
-                    or self._endpoint.startswith("inproc://")):
+            if (self._endpoint is not None
+                    and ("*" in self._endpoint or "::" in self._endpoint
+                         or self._endpoint.startswith("ipc://")
+                         or self._endpoint.startswith("inproc://"))):
                 self._pub.bind(self._endpoint)
-            else:
+            elif self._endpoint is not None:
                 self._pub.connect(self._endpoint)
 
         # Set up replay socket: use ROUTER
@@ -266,6 +290,38 @@ class ZmqEventPublisher(EventPublisher):
         # receiving payload is (-1, b""")
         self._replay.send_multipart((client_id, b"", self.END_SEQ, b""))
 
+    @staticmethod
+    def offset_endpoint_port(endpoint: Optional[str],
+                             data_parallel_rank: int) -> Optional[str]:
+        """Helper function to offset the port in an endpoint by 
+            the data parallel rank.
+
+        Args:
+            endpoint: The endpoint string 
+                (e.g., "tcp://*:5557" or "inproc://cache")
+            data_parallel_rank: The data parallel rank to offset by
+
+        Returns:
+            The endpoint with the port offset by data_parallel_rank 
+                or suffix appended
+        """
+        # Do nothing if input is None or data_parallel_rank is 0
+        if not endpoint or data_parallel_rank == 0:
+            return endpoint
+
+        if "inproc" in endpoint:
+            return f"{endpoint}_dp{data_parallel_rank}"
+        if "tcp" in endpoint:
+            if endpoint and ":" in endpoint:
+                # Get everything after the last colon (the port)
+                last_colon_idx = endpoint.rfind(":")
+                base_addr = endpoint[:last_colon_idx]
+                base_port = int(endpoint[last_colon_idx + 1:])
+                new_port = base_port + data_parallel_rank
+                return f"{base_addr}:{new_port}"
+            return endpoint
+        raise ValueError("Invalid endpoint: must contain 'inproc' or 'tcp'")
+
 
 class EventPublisherFactory:
     _registry: dict[str, Callable[..., EventPublisher]] = {
@@ -281,7 +337,9 @@ class EventPublisherFactory:
         cls._registry[name] = ctor
 
     @classmethod
-    def create(cls, config: Optional[KVEventsConfig]) -> EventPublisher:
+    def create(cls,
+               config: Optional[KVEventsConfig],
+               data_parallel_rank: int = 0) -> EventPublisher:
         """Create publisher from a config mapping."""
         if not config:
             return NullEventPublisher()
@@ -294,4 +352,5 @@ class EventPublisherFactory:
             constructor = cls._registry[kind]
         except KeyError as exc:
             raise ValueError(f"Unknown event publisher '{kind}'") from exc
-        return constructor(**config_dict)
+        return constructor(data_parallel_rank=data_parallel_rank,
+                           **config_dict)
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index e510a0626..32d03b311 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -80,7 +80,9 @@ class Scheduler(SchedulerInterface):
                 config=self.vllm_config, role=KVConnectorRole.SCHEDULER)
 
         self.kv_event_publisher = EventPublisherFactory.create(
-            self.kv_events_config)
+            self.kv_events_config,
+            vllm_config.parallel_config.data_parallel_rank,
+        )
 
         num_gpu_blocks = self.cache_config.num_gpu_blocks
         assert num_gpu_blocks is not None and num_gpu_blocks > 0
-- 
GitLab


From abd7df2fca570998693fa8c1ae39d83fb789ef27 Mon Sep 17 00:00:00 2001
From: Jiaxin Shan <seedjeffwan@gmail.com>
Date: Tue, 3 Jun 2025 17:15:18 -0700
Subject: [PATCH 154/274] [Misc] Fix path and python alias errors in
 disagg_prefill exmaples (#18919)

---
 .../disagg_prefill_lmcache_v1/disagg_example_nixl.sh        | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
index df8a41293..0b6c9213e 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
@@ -33,7 +33,7 @@ check_num_gpus() {
 
 ensure_python_library_installed() {
     echo "Checking if $1 is installed..."
-    python -c "import $1" > /dev/null 2>&1
+    python3 -c "import $1" > /dev/null 2>&1
     if [ $? -ne 0 ]; then
         if [ "$1" == "nixl" ]; then
             echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
@@ -121,8 +121,8 @@ main() {
     echo "All servers are up. Starting benchmark..."
 
     # begin benchmark
-    cd ../../../benchmarks/
-    python benchmark_serving.py --port 9000 --seed $(date +%s) \
+    cd ../../../../benchmarks/
+    python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
         --model meta-llama/Llama-3.1-8B-Instruct \
         --dataset-name random --random-input-len 7500 --random-output-len 200 \
         --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
-- 
GitLab


From 52dceb172d6fe762bb60b670df61866fe86b6f17 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 3 Jun 2025 21:09:13 -0400
Subject: [PATCH 155/274] [Docs] Add developer doc about CI failures (#18782)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/contributing/ci-failures.md | 120 +++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 docs/contributing/ci-failures.md

diff --git a/docs/contributing/ci-failures.md b/docs/contributing/ci-failures.md
new file mode 100644
index 000000000..4d8f78197
--- /dev/null
+++ b/docs/contributing/ci-failures.md
@@ -0,0 +1,120 @@
+# CI Failures
+
+What should I do when a CI job fails on my PR, but I don't think my PR caused
+the failure?
+
+- Check the dashboard of current CI test failures:  
+  👉 [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20)
+
+- If your failure **is already listed**, it's likely unrelated to your PR.  
+  Help fixing it is always welcome!  
+    - Leave comments with links to additional instances of the failure.  
+    - React with a 👍 to signal how many are affected.
+
+- If your failure **is not listed**, you should **file an issue**.
+
+## Filing a CI Test Failure Issue
+
+- **File a bug report:**  
+    👉 [New CI Failure Report](https://github.com/vllm-project/vllm/issues/new?template=450-ci-failure.yml)
+
+- **Use this title format:**
+  
+    ```
+    [CI Failure]: failing-test-job - regex/matching/failing:test
+    ```
+
+- **For the environment field:**
+  
+    ```
+ Still failing on main as of commit abcdef123
+    ```
+
+- **In the description, include failing tests:**
+  
+    ```
+    FAILED failing/test.py:failing_test1 - Failure description  
+     FAILED failing/test.py:failing_test2 - Failure description  
+    https://github.com/orgs/vllm-project/projects/20  
+    https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml  
+    FAILED failing/test.py:failing_test3 - Failure description  
+    ```
+
+- **Attach logs** (collapsible section example):
+    <details>
+    <summary>Logs:</summary>
+
+    ```text
+    ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data  
+    --- Logging error ---  
+    Traceback (most recent call last):  
+      File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 203, in execute_model  
+        return self.model_executor.execute_model(scheduler_output)  
+    ...
+    FAILED failing/test.py:failing_test1 - Failure description  
+    FAILED failing/test.py:failing_test2 - Failure description  
+    FAILED failing/test.py:failing_test3 - Failure description  
+    ```
+  
+    </details>
+
+## Logs Wrangling
+
+Download the full log file from Buildkite locally.
+
+Strip timestamps and colorization:
+
+```bash
+# Strip timestamps
+sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' ci.log
+
+# Strip colorization
+sed -i -r 's/\x1B\[[0-9;]*[mK]//g' ci.log
+```
+
+Use a tool for quick copy-pasting:
+
+```bash
+tail -525 ci_build.log | wl-copy
+```
+
+## Investigating a CI Test Failure
+
+1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main)  
+2. Bisect to find the first build that shows the issue.  
+3. Add your findings to the GitHub issue.  
+4. If you find a strong candidate PR, mention it in the issue and ping contributors.
+
+## Reproducing a Failure
+
+CI test failures may be flaky. Use a bash loop to run repeatedly:
+
+```bash
+COUNT=1; while pytest -sv tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]; do  
+  COUNT=$[$COUNT + 1]; echo "RUN NUMBER ${COUNT}";  
+done
+```
+
+## Submitting a PR
+
+If you submit a PR to fix a CI failure:
+
+- Link the PR to the issue:  
+  Add `Closes #12345` to the PR description.
+- Add the `ci-failure` label:  
+  This helps track it in the [CI Failures GitHub Project](https://github.com/orgs/vllm-project/projects/20).
+
+## Other Resources
+
+- 🔍 [Test Reliability on `main`](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&order=ASC&sort_by=reliability)
+- 🧪 [Latest Buildkite CI Runs](https://buildkite.com/vllm/ci/builds?branch=main)
+
+## Daily Triage
+
+Use [Buildkite analytics (2-day view)](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&period=2days) to:
+
+- Identify recent test failures **on `main`**.
+- Exclude legitimate test failures on PRs.
+- (Optional) Ignore tests with 0% reliability.
+
+Compare to the [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20).
-- 
GitLab


From 4555143ea7fdd2b2f0106e40889bfbab49879237 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Wed, 4 Jun 2025 09:43:01 +0800
Subject: [PATCH 156/274] [CPU] V1 support for the CPU backend (#16441)

---
 .../scripts/hardware_ci/run-cpu-test.sh       |  13 +-
 docs/usage/v1_guide.md                        |   2 +
 requirements/cpu.txt                          |   3 +
 .../attention/test_attention_selector.py      |   5 +-
 .../models/language/generation/test_common.py |   1 -
 vllm/attention/backends/cpu_mla.py            |   6 +-
 vllm/attention/backends/torch_sdpa.py         |  16 +-
 vllm/compilation/wrapper.py                   |   7 +-
 vllm/engine/arg_utils.py                      |   4 +-
 vllm/platforms/cpu.py                         |  67 +++++--
 vllm/v1/attention/backends/cpu_attn.py        | 163 ++++++++++++++++++
 vllm/v1/worker/cpu_model_runner.py            |  86 +++++++++
 vllm/v1/worker/cpu_worker.py                  | 101 +++++++++++
 vllm/v1/worker/gpu_model_runner.py            |  28 +--
 vllm/v1/worker/gpu_worker.py                  |   3 +-
 15 files changed, 465 insertions(+), 40 deletions(-)
 create mode 100644 vllm/v1/attention/backends/cpu_attn.py
 create mode 100644 vllm/v1/worker/cpu_model_runner.py
 create mode 100644 vllm/v1/worker/cpu_worker.py

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 0a1193560..61aa7df13 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -6,6 +6,7 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -56,7 +55,7 @@ function cpu_tests() {
   # Run AWQ test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
@@ -68,8 +67,6 @@ function cpu_tests() {
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    export VLLM_CPU_KVCACHE_SPACE=10 
-    export VLLM_CPU_OMP_THREADS_BIND=$1
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \
@@ -89,4 +86,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index a2321bf98..7c4909cb5 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -40,6 +40,8 @@ This living user guide outlines a few known **important changes and limitations*
 | **NVIDIA** | <nobr>🚀 Natively Supported</nobr>         |
 | **AMD**    | <nobr>🚧 WIP</nobr>           |
 | **TPU**    | <nobr>🚧 WIP</nobr>           |
+| **CPU**    | <nobr>🚧 WIP</nobr>           |
+
 #### Feature / Model
 
 | Feature / Model | Status |
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 121330158..e43b44397 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -1,6 +1,9 @@
 # Common dependencies
 -r common.txt
 
+numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2; python_version > '3.9'
+
 # Dependencies for CPUs
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 435fe6225..f3e641557 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -85,7 +85,10 @@ def test_env(
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, torch.float16,
                                            block_size, False)
-            assert backend.get_name() == "TORCH_SDPA"
+            if use_v1:
+                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            else:
+                assert backend.get_name() == "TORCH_SDPA"
 
         elif device == "hip":
             with patch("vllm.attention.selector.current_platform",
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index ed9e54722..f656f90c4 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -87,7 +87,6 @@ AITER_MODEL_LIST = [
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
         pytest.param(
             "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.cpu_model],
         )
     ])
 @pytest.mark.parametrize("max_tokens", [32])
diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py
index cf7883e12..793cb87b7 100644
--- a/vllm/attention/backends/cpu_mla.py
+++ b/vllm/attention/backends/cpu_mla.py
@@ -178,7 +178,7 @@ class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]):
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_kv_len=max_kv_len,
-            query_start_loc=query_start_loc,
+            prefill_query_start_loc=query_start_loc,
             kv_start_loc=kv_start_loc,
             max_decode_seq_len=input_data.max_decode_seq_len,
             num_prefills=input_data.num_prefills,
@@ -264,8 +264,8 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
             key=k,
             value=v_padded,
             out=output,
-            seqlen_q=prefill_metadata.query_start_loc,
-            seqlen_k=prefill_metadata.query_start_loc,
+            seqlen_q=prefill_metadata.prefill_query_start_loc,
+            seqlen_k=prefill_metadata.prefill_query_start_loc,
             max_seqlen_q=prefill_metadata.max_query_len,
             max_seqlen_k=prefill_metadata.max_query_len,
             pdropout=0.0,
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index f3fb5adcf..23231c323 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -87,10 +87,13 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
     # For chunked prefill only
     max_query_len: Optional[int] = None
     max_kv_len: Optional[int] = None
-    query_start_loc: Optional[torch.Tensor] = None
+    prefill_query_start_loc: Optional[torch.Tensor] = None
     kv_start_loc: Optional[torch.Tensor] = None
     prefill_block_tables: Optional[torch.Tensor] = None
 
+    # For V1 logits index only
+    query_start_loc: Optional[torch.Tensor] = None
+
     # Begin encoder attn & enc/dec cross-attn fields...
     # Encoder sequence lengths representation
     encoder_seq_lens: Optional[List[int]] = None
@@ -375,7 +378,7 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]):
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_kv_len=max_kv_len,
-            query_start_loc=query_start_loc,
+            prefill_query_start_loc=query_start_loc,
             kv_start_loc=kv_start_loc,
             max_decode_seq_len=input_data.max_decode_seq_len,
             num_prefills=input_data.num_prefills,
@@ -470,6 +473,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+
+        # For warming-up
+        if attn_metadata is None:
+            return query
+
         attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
@@ -537,8 +545,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
 
         output = torch.empty_like(query)
         if prefill_meta := attn_metadata.prefill_metadata:
-            assert attn_metadata.seq_lens is not None
             if not prefill_meta.prefill_metadata.chunked_prefill:  # type: ignore
+                assert attn_metadata.seq_lens is not None
                 self._run_sdpa_forward(output,
                                        query,
                                        key,
@@ -555,7 +563,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
                     query[:prefill_meta.num_prefill_tokens, :, :],
                     key_cache,
                     value_cache,
-                    prefill_meta.query_start_loc,
+                    prefill_meta.prefill_query_start_loc,
                     prefill_meta.kv_start_loc,
                     prefill_meta.max_query_len,
                     prefill_meta.max_kv_len,
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 8c8d0b5cb..2a261c84c 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -41,11 +41,16 @@ class TorchCompileWrapperWithCustomDispatcher:
             # compiling the forward method
 
             backend = vllm_config.compilation_config.init_backend(vllm_config)
+            options = None
+            if isinstance(backend, str) and backend == "inductor":
+                options = get_current_vllm_config(
+                ).compilation_config.inductor_compile_config
 
             compiled_callable = torch.compile(
                 self.forward,
                 fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+                backend=backend,
+                options=options)
 
         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 2197d44ca..b1c4b27a0 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1399,6 +1399,7 @@ class EngineArgs:
             "FLASHINFER",
             "FLASHINFER_VLLM_V1",
             "ROCM_AITER_MLA",
+            "TORCH_SDPA_VLLM_V1",
         ]
         if (envs.is_set("VLLM_ATTENTION_BACKEND")
                 and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
@@ -1431,7 +1432,8 @@ class EngineArgs:
 
         # Non-[CUDA, TPU] may be supported on V1, but off by default for now.
         v0_hardware = not any(
-            (current_platform.is_cuda(), current_platform.is_tpu()))
+            (current_platform.is_cuda(), current_platform.is_tpu(),
+             current_platform.is_cpu()))
         if v0_hardware and _warn_or_fallback(  # noqa: SIM103
                 current_platform.device_name):
             return False
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 2739f5c8c..265959d62 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -57,7 +57,10 @@ class CpuPlatform(Platform):
             logger.info("Using CPU MLA backend.")
             return "vllm.attention.backends.cpu_mla.CPUMLABackend"
         logger.info("Using Torch SDPA backend.")
-        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+        if use_v1:
+            return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
+        else:
+            return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
@@ -81,6 +84,8 @@ class CpuPlatform(Platform):
         if not model_config.enforce_eager:
             model_config.enforce_eager = True
 
+        model_config.disable_cascade_attn = True
+
         cache_config = vllm_config.cache_config
 
         ipex_available = find_spec("intel_extension_for_pytorch") is not None
@@ -128,7 +133,8 @@ class CpuPlatform(Platform):
                 f" {kv_cache_space}, expect a positive integer value.")
 
         parallel_config = vllm_config.parallel_config
-        if (parallel_config.distributed_executor_backend is not None
+        if (parallel_config.world_size > 1
+                and parallel_config.distributed_executor_backend is not None
                 and parallel_config.distributed_executor_backend != "mp"):
             logger.warning(("%s is not supported on CPU, fallback to mp "
                             "distributed executor backend."),
@@ -141,7 +147,38 @@ class CpuPlatform(Platform):
                 parallel_config.sd_worker_cls = \
                     "vllm.worker.cpu_worker.CPUWorker"
             else:
-                parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+                if envs.VLLM_USE_V1:
+                    parallel_config.worker_cls = \
+                        "vllm.v1.worker.cpu_worker.CPUWorker"
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.worker.cpu_worker.CPUWorker"
+
+        # Note: workaround for v1 gpu_model_runner
+        from vllm.config import CompilationLevel
+        vllm_config.compilation_config.cudagraph_capture_sizes = []
+
+        compilation_config = vllm_config.compilation_config
+        if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
+                == CompilationLevel.PIECEWISE):
+            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.backend = "eager"
+            compilation_config.custom_ops += ["none"]
+            compilation_config.inductor_compile_config.update({
+                "dce":
+                True,
+                "size_asserts":
+                False,
+                "nan_asserts":
+                False,
+                "memory_planning":
+                True,
+                "epilogue_fusion":
+                True,
+            })
+
+        if vllm_config.lora_config is not None:
+            compilation_config.level = CompilationLevel.NO_COMPILATION
 
         assert vllm_config.device_config.device_type == "cpu"
 
@@ -149,6 +186,12 @@ class CpuPlatform(Platform):
         # Environment variables for CPU executor
         #
 
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+        # Note: to avoid the error 'nthreads cannot be larger than environment
+        #  variable "NUMEXPR_MAX_THREADS" (64)'.
+        os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0)))
+
         # Set default threads num for OpenMP parallel
         os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
 
@@ -171,13 +214,6 @@ class CpuPlatform(Platform):
         # To hint IPEX uses shared memory based AllReduce
         os.environ["LOCAL_WORLD_SIZE"] = str(
             vllm_config.parallel_config.tensor_parallel_size)
-        if sys.platform == "darwin" and \
-                envs.VLLM_WORKER_MULTIPROC_METHOD == "fork":
-            if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None:
-                logger.warning(
-                    "Default to spawn method on MacOS. If this is not desired,"
-                    " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
-                os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 
         if vllm_config.model_config and vllm_config.model_config.use_mla:
             logger.info(
@@ -204,3 +240,14 @@ class CpuPlatform(Platform):
         Get device specific communicator class for distributed communication.
         """
         return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator"  # noqa
+
+    @classmethod
+    def supports_structured_output(cls) -> bool:
+        return True
+
+    @classmethod
+    def supports_v1(cls, model_config) -> bool:
+        """Returns whether the current platform can support v1 for the supplied
+        model configuration.
+        """
+        return True
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
new file mode 100644
index 000000000..d7a580c28
--- /dev/null
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import torch
+
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl,
+                                                TorchSDPAMetadata)
+from vllm.attention.backends.utils import CommonAttentionState
+from vllm.attention.ops.ipex_attn import PagedAttention
+from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.worker.block_table import BlockTable
+from vllm.v1.worker.cpu_model_runner import CPUModelRunner
+from vllm.v1.worker.gpu_input_batch import InputBatch
+
+
+class TorchSDPABackend:
+    accept_output_buffer: bool = False
+
+    @staticmethod
+    def get_name() -> str:
+        return "TORCH_SDPA_VLLM_V1"
+
+    @staticmethod
+    def get_impl_cls() -> type["TorchSDPABackendImpl"]:
+        return TorchSDPABackendImpl
+
+    @staticmethod
+    def get_metadata_cls() -> type["AttentionMetadata"]:
+        return TorchSDPAMetadata
+
+    @staticmethod
+    def get_state_cls() -> type["CommonAttentionState"]:
+        return CommonAttentionState
+
+    @staticmethod
+    def get_builder_cls() -> type["TorchSDPAMetadataBuilderV1"]:
+        return TorchSDPAMetadataBuilderV1
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> tuple[int, ...]:
+        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
+                                                 num_kv_heads, head_size)
+
+    @staticmethod
+    def use_cascade_attention(*args, **kwargs) -> bool:
+        return False
+
+
+class TorchSDPAMetadataBuilderV1:
+
+    def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec,
+                 block_table: BlockTable) -> None:
+        self.runner = runner
+        self.block_table = block_table
+
+        # For reorder
+        self.reorder_prompt_req_index_list = np.empty(self.runner.max_num_reqs,
+                                                      dtype=np.int64)
+        self.reorder_decode_req_index_list = np.empty(self.runner.max_num_reqs,
+                                                      dtype=np.int64)
+        self.num_prompt_req: int = 0
+
+        self.seq_start_loc_cpu = torch.zeros(
+            runner.max_num_reqs + 1,
+            dtype=torch.int32,
+            device="cpu",
+        )
+        self.seq_start_loc_np = self.seq_start_loc_cpu.numpy()
+
+    def reorder_batch(self, input_batch: InputBatch,
+                      scheduler_output: SchedulerOutput) -> bool:
+        prompt_list_idx = 0
+        decode_list_idx = 0
+        for req_index in range(input_batch.num_reqs):
+            if input_batch.num_computed_tokens_cpu[
+                    req_index] < input_batch.num_prompt_tokens[req_index]:
+                # prompt stage
+                self.reorder_prompt_req_index_list[prompt_list_idx] = req_index
+                prompt_list_idx += 1
+            else:
+                # decode stage
+                self.reorder_decode_req_index_list[decode_list_idx] = req_index
+                decode_list_idx += 1
+        assert decode_list_idx + prompt_list_idx == input_batch.num_reqs
+
+        # Update prompt requests number
+        self.num_prompt_req = prompt_list_idx
+
+        reorder_req_num = 0
+        for req_index in range(decode_list_idx):
+            if self.reorder_decode_req_index_list[req_index] < prompt_list_idx:
+                reorder_req_num += 1
+            else:
+                break
+
+        if reorder_req_num == 0:
+            return False
+
+        reorder_prompt_list = (
+            self.reorder_prompt_req_index_list[:prompt_list_idx]
+            [-reorder_req_num:])
+        reorder_decode_list = (
+            self.reorder_decode_req_index_list[:decode_list_idx]
+            [:reorder_req_num])
+        assert reorder_decode_list.size == reorder_prompt_list.size
+
+        for idx in range(reorder_req_num):
+            prompt_req_index = reorder_prompt_list[idx].item()
+            decode_req_index = reorder_decode_list[idx].item()
+            input_batch.swap_states(prompt_req_index, decode_req_index)
+
+        return True
+
+    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
+              common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata):
+        runner = self.runner
+        block_table = self.block_table
+        seq_lens_np = runner.seq_lens_np[:num_reqs]
+        num_prompt_req = self.num_prompt_req
+        max_prefill_seq_len = seq_lens_np[:num_prompt_req].max().item(
+        ) if num_prompt_req > 0 else 0
+        max_decode_seq_len = seq_lens_np[num_prompt_req:num_reqs].max().item(
+        ) if num_prompt_req < num_reqs else 0
+        self.seq_start_loc_np[0] = 0
+        np.cumsum(seq_lens_np, out=self.seq_start_loc_np[1:num_reqs + 1])
+        num_prefill_tokens = runner.query_start_loc_np[num_prompt_req].item()
+        num_decode_tokens = runner.query_start_loc_np[num_reqs].item(
+        ) - num_prefill_tokens
+        slot_mapping = block_table.slot_mapping_cpu[:num_actual_tokens].long()
+        block_table_tensor = block_table.get_device_tensor()
+        attn_metadata = TorchSDPAMetadata(
+            num_prefills=num_prompt_req,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decode_tokens=num_decode_tokens,
+            slot_mapping=slot_mapping,
+            seq_lens_tensor=runner.
+            seq_lens_cpu[num_prompt_req:num_reqs],  # decode
+            max_decode_seq_len=max_decode_seq_len,  # decode
+            block_tables=block_table_tensor[num_prompt_req:num_reqs],  # decode
+            chunked_prefill=True,
+            max_query_len=max_query_len,
+            max_kv_len=max_prefill_seq_len,
+            prefill_query_start_loc=runner.
+            query_start_loc_cpu[:num_prompt_req + 1],  # prefill
+            kv_start_loc=self.seq_start_loc_cpu[:num_prompt_req +
+                                                1],  # prefill
+            prefill_block_tables=block_table_tensor[:
+                                                    num_prompt_req],  # prefill
+            query_start_loc=runner.query_start_loc_cpu[:num_reqs +
+                                                       1],  # for logits index
+            multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
+        )
+
+        return attn_metadata
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
new file mode 100644
index 000000000..607cfc0ef
--- /dev/null
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+from contextlib import contextmanager
+from typing import Any
+
+import torch
+
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader import get_model
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+
+logger = init_logger(__name__)
+
+
+class CPUModelRunner(GPUModelRunner):
+
+    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        super().__init__(vllm_config, device)
+
+        assert device == torch.device("cpu")
+        assert self.speculative_config is None, "spec decode is not supported."
+
+        self.use_cuda_graph = False
+        self.cascade_attn_enabled = False
+
+        self._postprocess_tenosrs()
+
+    def _postprocess_tenosrs(self) -> None:
+        # Note: replace device tensors with cpu tensors
+        def replace_tensor(obj: Any, cpu_attr_name: str,
+                           device_attr_name) -> None:
+            cpu_tensor = getattr(obj, cpu_attr_name, None)
+            device_tensor = getattr(obj, device_attr_name, None)
+            if cpu_tensor is not None and device_tensor is not None:
+                assert isinstance(cpu_tensor, torch.Tensor)
+                assert isinstance(device_tensor, torch.Tensor)
+                setattr(obj, device_attr_name, cpu_tensor)
+
+        for k, v in vars(self).items():
+            if k.endswith("_cpu") and isinstance(v, torch.Tensor):
+                replace_tensor(self, k, k[:-4])
+
+        for k, v in vars(self.input_batch).items():
+            if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor):
+                replace_tensor(self.input_batch, k, k[:-11])
+
+        for k, v in vars(self.input_batch.block_table).items():
+            if k.endswith("_cpu") and isinstance(v, torch.Tensor):
+                replace_tensor(self.input_batch.block_table, k, k[:-4])
+
+    def load_model(self) -> None:
+        logger.info("Starting to load model %s...", self.model_config.model)
+        self.model = get_model(vllm_config=self.vllm_config)
+
+        if self.lora_config:
+            self.model = self.load_lora_model(self.model, self.model_config,
+                                              self.scheduler_config,
+                                              self.lora_config, self.device)
+
+    def warming_up_model(self) -> None:
+        logger.info("Warming up model for the compilation...")
+        # Only generate graph for the generic shape
+        self._dummy_run(max(16, self.max_num_reqs))
+        logger.info("Warming up done.")
+
+    def _init_device_properties(self) -> None:
+        pass
+
+    def _sync_device(self) -> None:
+        pass
+
+
+@contextmanager
+def _set_global_compilation_settings():
+    import torch._inductor.config
+
+    # Note: The CPPGEMM backend requires freezing parameters.
+    freezing_value = torch._inductor.config.freezing
+    torch._inductor.config.freezing = True
+    # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects
+    # including object type dict"
+    force_disable_caches = torch._inductor.config.force_disable_caches
+    torch._inductor.config.force_disable_caches = True
+    yield
+    torch._inductor.config.freezing = freezing_value
+    torch._inductor.config.force_disable_caches = force_disable_caches
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
new file mode 100644
index 000000000..0b710b7bc
--- /dev/null
+++ b/vllm/v1/worker/cpu_worker.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import Optional
+
+import torch
+
+from vllm import envs
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group, get_tp_group
+from vllm.logger import init_logger
+from vllm.model_executor.utils import set_random_seed
+from vllm.sequence import IntermediateTensors
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.worker.cpu_model_runner import CPUModelRunner
+from vllm.v1.worker.gpu_worker import (Worker,
+                                       init_worker_distributed_environment)
+
+logger = init_logger(__name__)
+
+
+class CPUWorker(Worker):
+
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 local_rank: int,
+                 rank: int,
+                 distributed_init_method: str,
+                 is_driver_worker: bool = False):
+        super().__init__(vllm_config,
+                         local_rank,
+                         rank,
+                         distributed_init_method,
+                         is_driver_worker=is_driver_worker)
+
+        self.parallel_config.disable_custom_all_reduce = True
+
+    def init_device(self):
+        # Setup OpenMP threads affinity.
+        omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
+        if omp_cpuids == "all":
+            self.local_omp_cpuid = "all"
+        else:
+            self.local_omp_cpuid = omp_cpuids.split("|")[self.rank]
+            ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
+            if ret:
+                logger.info(ret)
+
+        # Note: unique identifier for creating allreduce shared memory
+        os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split(
+            ":")[-1]
+        # Initialize the distributed environment.
+        init_worker_distributed_environment(self.vllm_config, self.rank,
+                                            self.distributed_init_method,
+                                            self.local_rank, "gloo")
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+
+        # Construct the model runner
+        self.model_runner: CPUModelRunner = CPUModelRunner(
+            self.vllm_config, torch.device("cpu"))
+
+    def sleep(self, level: int = 1) -> None:
+        logger.warning("sleep mode is not supported on CPU, ignore it.")
+        pass
+
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        logger.warning("sleep mode is not supported on CPU, ignore it.")
+        pass
+
+    def determine_available_memory(self) -> int:
+        return self.cache_config.cpu_kvcache_space_bytes  # type: ignore
+
+    def compile_or_warm_up_model(self) -> None:
+        # Reset the seed to ensure that the random state is not affected by
+        # the model initialization and profiling.
+        set_random_seed(self.model_config.seed)
+        self.model_runner.warming_up_model()
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> Optional[ModelRunnerOutput]:
+        intermediate_tensors = None
+        if not get_pp_group().is_first_rank:
+            intermediate_tensors = IntermediateTensors(
+                get_pp_group().recv_tensor_dict(
+                    all_gather_group=get_tp_group()))
+
+        output = self.model_runner.execute_model(scheduler_output,
+                                                 intermediate_tensors)
+
+        if not get_pp_group().is_last_rank:
+            assert isinstance(output, IntermediateTensors)
+            get_pp_group().send_tensor_dict(output.tensors,
+                                            all_gather_group=get_tp_group())
+            return None
+
+        assert isinstance(output, ModelRunnerOutput)
+        return output if self.is_driver_worker else None
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6a566a602..6ea6bb020 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5,7 +5,7 @@ import copy
 import gc
 import time
 import weakref
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 import numpy as np
 import torch
@@ -38,7 +38,6 @@ from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
                         check_use_alibi, is_pin_memory_available)
-from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
@@ -203,8 +202,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 self.vllm_config.compilation_config.cudagraph_capture_sizes))
 
         # Cache the device properties.
-        self.device_properties = torch.cuda.get_device_properties(self.device)
-        self.num_sms = self.device_properties.multi_processor_count
+        self._init_device_properties()
 
         # Persistent buffers for CUDA graphs.
         self.input_ids = torch.zeros(self.max_num_tokens,
@@ -315,6 +313,17 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 self.input_batch, scheduler_output)
         return batch_reordered
 
+    # Note: used for model runner override.
+    def _init_device_properties(self) -> None:
+        """Initialize attributes from torch.cuda.get_device_properties
+        """
+        self.device_properties = torch.cuda.get_device_properties(self.device)
+        self.num_sms = self.device_properties.multi_processor_count
+
+    # Note: used for model runner override.
+    def _sync_device(self) -> None:
+        torch.cuda.synchronize()
+
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         """Update the cached states and the persistent batch with the scheduler
         output.
@@ -538,8 +547,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
     def _prepare_inputs(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> tuple[dict[str, FlashAttentionMetadata], torch.Tensor,
-               Optional[SpecDecodeMetadata]]:
+    ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata]]:
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0
         num_reqs = self.input_batch.num_reqs
@@ -652,7 +660,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         common_attn_metadata = CommonAttentionMetadata(
             query_start_loc=query_start_loc, seq_lens=seq_lens)
 
-        attn_metadata: dict[str, FlashAttentionMetadata] = {}
+        attn_metadata: dict[str, Any] = {}
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
@@ -1710,7 +1718,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         # Must synchronize the non-blocking GPU->CPU transfers.
         if prompt_logprobs_dict:
-            torch.cuda.synchronize()
+            self._sync_device()
 
         return prompt_logprobs_dict
 
@@ -1740,7 +1748,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                         dtype=np.int32)
 
         if skip_attn:
-            attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None
+            attn_metadata: Optional[dict[str, Any]] = None
         else:
             query_start_loc = self.query_start_loc[:num_reqs + 1]
             seq_lens = self.seq_lens[:num_reqs]
@@ -1964,7 +1972,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             sampler_output = self._dummy_sampler_run(hidden_states)
         else:
             sampler_output = None
-        torch.cuda.synchronize()
+        self._sync_device()
         del hidden_states, sampler_output
         self.encoder_cache.clear()
         gc.collect()
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f36cf5d5c..3bf3b2221 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -342,13 +342,14 @@ def init_worker_distributed_environment(
     rank: int,
     distributed_init_method: Optional[str] = None,
     local_rank: int = -1,
+    backend: str = "nccl",
 ) -> None:
     """Initialize the distributed environment."""
     parallel_config = vllm_config.parallel_config
     set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
 
     init_distributed_environment(parallel_config.world_size, rank,
-                                 distributed_init_method, local_rank)
+                                 distributed_init_method, local_rank, backend)
 
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                       parallel_config.pipeline_parallel_size)
-- 
GitLab


From 1409ef913446aa282f6426efbb0ed02a59320467 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Wed, 4 Jun 2025 04:24:56 +0100
Subject: [PATCH 157/274] [Core] Cast multimodal input in hf processor (#18862)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/inputs/registry.py                       | 26 +++++++++++++++++--
 vllm/multimodal/inputs.py                     |  8 +-----
 vllm/spec_decode/draft_model_runner.py        |  1 -
 vllm/v1/worker/gpu_model_runner.py            |  2 --
 vllm/v1/worker/tpu_model_runner.py            |  2 --
 vllm/worker/cpu_enc_dec_model_runner.py       |  1 -
 vllm/worker/cpu_model_runner.py               |  1 -
 vllm/worker/cpu_pooling_model_runner.py       |  1 -
 vllm/worker/enc_dec_model_runner.py           |  1 -
 vllm/worker/model_runner.py                   |  1 -
 vllm/worker/multi_step_neuron_model_runner.py |  1 -
 ...i_step_neuronx_distributed_model_runner.py |  1 -
 vllm/worker/neuron_model_runner.py            |  2 --
 vllm/worker/pooling_model_runner.py           |  1 -
 vllm/worker/xpu_model_runner.py               |  1 -
 15 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 73d19aecd..3dad021e3 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -4,9 +4,12 @@ from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
 
+import torch
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import TypeVar
 
+from vllm.jsontree import JSONTree, json_map_leaves
+from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import resolve_mm_processor_kwargs
@@ -21,6 +24,8 @@ _T = TypeVar("_T")
 _C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
 
+logger = init_logger(__name__)
+
 
 @dataclass(frozen=True)
 class InputContext:
@@ -134,7 +139,7 @@ class InputProcessingContext(InputContext):
         hf_processor: ProcessorMixin,
         data: Mapping[str, object],
         kwargs: Mapping[str, object] = {},
-    ) -> BatchFeature:
+    ) -> Union[BatchFeature, JSONTree]:
         """
         Call `hf_processor` on the prompt `data`
         (text, image, audio...) with configurable options `kwargs`.
@@ -154,8 +159,25 @@ class InputProcessingContext(InputContext):
             allow_var_kwargs=True,
         )
 
+        def maybe_cast_dtype(x):
+            # This mimics the behavior of transformers.BatchFeature
+            if isinstance(x, torch.Tensor) and x.is_floating_point():
+                return x.to(dtype=self.model_config.dtype)
+            return x
+
         try:
-            return hf_processor(**data, **merged_kwargs, return_tensors="pt")
+            output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
+            # this emulates output.to(dtype=self.model_config.dtype)
+            cast_output = json_map_leaves(maybe_cast_dtype, output)
+            if isinstance(output, BatchFeature):
+                return BatchFeature(cast_output)
+
+            logger.warning_once(
+                f"{type(hf_processor).__name__} did not return `BatchFeature`. "
+                "Make sure to match the behaviour of `ProcessorMixin` when "
+                "implementing custom processors.")
+            return cast_output
+
         except Exception as exc:
             msg = (f"Failed to apply {type(hf_processor).__name__} "
                    f"on data={data} with kwargs={merged_kwargs}")
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 35d2a6e8c..0bf5b1cf1 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -747,17 +747,11 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         batched_inputs: BatchedTensorInputs,
         *,
         device: torch.types.Device,
-        dtype: Optional[torch.dtype] = None,
     ) -> BatchedTensorInputs:
         json_inputs = cast(JSONTree[torch.Tensor], batched_inputs)
 
-        def maybe_cast_dtype(x: torch.Tensor):
-            # This mimics the behavior of transformers.BatchFeature
-            return x.to(dtype=dtype) if x.is_floating_point() else x
-
         json_mapped = json_map_leaves(
-            # NOTE: Cast the dtype before sending it to device
-            lambda x: maybe_cast_dtype(x).to(device=device, non_blocking=True),
+            lambda x: x.to(device=device, non_blocking=True),
             json_inputs,
         )
 
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 8ccfefea1..96646ec94 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -297,7 +297,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
                     intermediate_tensors=intermediate_tensors,
                     **MultiModalKwargs.as_kwargs(
                         multi_modal_kwargs,
-                        dtype=self.model_runner.model_config.dtype,
                         device=self.device,
                     ),
                     **model_execute_kwargs,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6ea6bb020..9ac33a149 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -957,7 +957,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
             batched_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_mm_inputs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
 
@@ -1951,7 +1950,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 [dummy_mm_kwargs] * max_num_mm_items)
             batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_dummy_mm_inputs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 73c445d14..94e438fb4 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -718,7 +718,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
             batched_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_mm_inputs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
 
@@ -1560,7 +1559,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                                          batch_size)
         return MultiModalKwargs.as_kwargs(
             batched_dummy_mm_inputs,
-            dtype=self.model_config.dtype,
             device=self.device,
         )
 
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index 677d66357..c99e2652a 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -300,7 +300,6 @@ class CPUEncoderDecoderModelRunner(
             model_input.encoder_input_positions,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
             "intermediate_tensors":
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 6213cf760..68cdf65ca 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -630,7 +630,6 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]):
         if model_input.multi_modal_kwargs is not None:
             multimodal_kwargs = MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
         execute_model_kwargs = {}
diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py
index 174f86f48..203fdf225 100644
--- a/vllm/worker/cpu_pooling_model_runner.py
+++ b/vllm/worker/cpu_pooling_model_runner.py
@@ -53,7 +53,6 @@ class CPUPoolingModelRunner(
             model_input.input_positions,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
             **cross_enc_kwargs,
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index a3e7b0147..8d92edc5b 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -205,7 +205,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(
                     multi_modal_kwargs,
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
                 **seqlen_agnostic_kwargs,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 75501e0f7..82db6617b 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1848,7 +1848,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
                     intermediate_tensors=intermediate_tensors,
                     **MultiModalKwargs.as_kwargs(
                         multi_modal_kwargs,
-                        dtype=self.model_config.dtype,
                         device=self.device,
                     ),
                     **seqlen_agnostic_kwargs,
diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py
index 336e41649..25f588077 100644
--- a/vllm/worker/multi_step_neuron_model_runner.py
+++ b/vllm/worker/multi_step_neuron_model_runner.py
@@ -73,7 +73,6 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
             input_block_ids=model_input.input_block_ids,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
         )
diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
index de9827723..dd521dd67 100644
--- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py
+++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
@@ -52,7 +52,6 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
             sampling_params=sampling_params,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
         )
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 28855bb46..7ccf1a2c0 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -395,7 +395,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                 adapter_ids=model_input.adapter_ids,
                 **MultiModalKwargs.as_kwargs(
                     model_input.multi_modal_kwargs or {},
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
             )
@@ -408,7 +407,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                 input_block_ids=model_input.input_block_ids,
                 **MultiModalKwargs.as_kwargs(
                     model_input.multi_modal_kwargs or {},
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
             )
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index be6b3d137..f80955f71 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -122,7 +122,6 @@ class PoolingModelRunner(
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(
                     multi_modal_kwargs,
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
                 **cross_enc_kwargs,
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index ecbb63d91..b2d3ce852 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -565,7 +565,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(
                     model_input.multi_modal_kwargs or {},
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
             )
-- 
GitLab


From 5d6d1adf15aca59cb135853d0f11308af4bbd6e3 Mon Sep 17 00:00:00 2001
From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Date: Wed, 4 Jun 2025 08:13:01 +0400
Subject: [PATCH 158/274] [KERNEL] Sampler. CUDA kernel for applying repetition
 penalty (#18437)

---
 CMakeLists.txt                                |  1 +
 csrc/ops.h                                    |  5 ++
 csrc/sampler.cu                               | 86 +++++++++++++++++++
 csrc/torch_bindings.cpp                       |  7 ++
 .../test_apply_repetition_penalties.py        | 76 ++++++++++++++++
 vllm/_custom_ops.py                           | 39 +++++++++
 vllm/model_executor/layers/utils.py           | 13 +--
 7 files changed, 218 insertions(+), 9 deletions(-)
 create mode 100644 csrc/sampler.cu
 create mode 100644 tests/kernels/test_apply_repetition_penalties.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 87aa23c08..f11d28590 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,6 +242,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
diff --git a/csrc/ops.h b/csrc/ops.h
index 7044b4588..297f32b4a 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void apply_repetition_penalties_(torch::Tensor& logits,
+                                 const torch::Tensor& prompt_mask,
+                                 const torch::Tensor& output_mask,
+                                 const torch::Tensor& repetition_penalties);
+
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
                                double epsilon);
diff --git a/csrc/sampler.cu b/csrc/sampler.cu
new file mode 100644
index 000000000..ee5793dda
--- /dev/null
+++ b/csrc/sampler.cu
@@ -0,0 +1,86 @@
+#include "dispatch_utils.h"
+
+#include <torch/cuda.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#ifndef USE_ROCM
+  #include <cub/cub.cuh>
+#else
+  #include <hipcub/hipcub.hpp>
+#endif
+
+namespace vllm {
+
+template <typename scalar_t>
+__global__ void apply_repetition_penalties_kernel(
+    scalar_t* __restrict__ logits,         // [num_seqs, vocab_size]
+    const bool* __restrict__ prompt_mask,  // [num_seqs, vocab_size]
+    const bool* __restrict__ output_mask,  // [num_seqs, vocab_size]
+    const scalar_t* __restrict__ repetition_penalties,  // [num_seqs]
+    const int num_seqs, const int vocab_size, const int tile_size) {
+  // Each block handles one sequence and a tile of vocab
+  const int seq_idx = blockIdx.x;
+  if (seq_idx >= num_seqs) return;
+
+  const int tile_start = blockIdx.y * tile_size;
+  const int tile_end = min(tile_start + tile_size, vocab_size);
+
+  // Load repetition penalty for this sequence
+  const scalar_t penalty = repetition_penalties[seq_idx];
+
+  // Each thread processes multiple vocab items within the tile
+  for (int vocab_idx = tile_start + threadIdx.x; vocab_idx < tile_end;
+       vocab_idx += blockDim.x) {
+    const int64_t idx = static_cast<int64_t>(seq_idx) * vocab_size + vocab_idx;
+    const bool is_repeated = prompt_mask[idx] || output_mask[idx];
+    if (is_repeated) {
+      scalar_t logit = logits[idx];
+      if (logit > 0) {
+        logits[idx] = logit / penalty;
+      } else {
+        logits[idx] = logit * penalty;
+      }
+    }
+  }
+}
+
+}  // namespace vllm
+
+void apply_repetition_penalties_(
+    torch::Tensor& logits,             // [num_seqs, vocab_size], in-place
+    const torch::Tensor& prompt_mask,  // [num_seqs, vocab_size]
+    const torch::Tensor& output_mask,  // [num_seqs, vocab_size]
+    const torch::Tensor& repetition_penalties) {  // [num_seqs]
+  TORCH_CHECK(logits.is_contiguous());
+  TORCH_CHECK(prompt_mask.is_contiguous());
+  TORCH_CHECK(output_mask.is_contiguous());
+  TORCH_CHECK(repetition_penalties.is_contiguous());
+
+  int vocab_size = logits.size(-1);
+  int num_seqs = logits.size(0);
+
+  // Get number of SMs on the current device
+  int sms = 0;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
+                         logits.get_device());
+
+  // Compute tile_num and tile_size
+  int tile_num =
+      std::min(vocab_size, std::max(1, (sms + num_seqs - 1) / num_seqs));
+  int tile_size = (vocab_size + tile_num - 1) / tile_num;
+
+  // Each block handles one sequence and a tile of vocab
+  dim3 grid(num_seqs, tile_num);
+  dim3 block(std::min(tile_size, 1024));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(logits));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+      logits.scalar_type(), "apply_repetition_penalties_kernel", [&] {
+        vllm::apply_repetition_penalties_kernel<scalar_t>
+            <<<grid, block, 0, stream>>>(
+                logits.data_ptr<scalar_t>(), prompt_mask.data_ptr<bool>(),
+                output_mask.data_ptr<bool>(),
+                repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
+                tile_size);
+      });
+}
\ No newline at end of file
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 371894c56..3fffaf290 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -170,6 +170,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
 
+  // Apply repetition penalties to logits in-place
+  ops.def(
+      "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
+      "Tensor output_mask, Tensor repetition_penalties) -> ()");
+  ops.impl("apply_repetition_penalties_", torch::kCUDA,
+           &apply_repetition_penalties_);
+
   // Layernorm-quant
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
   ops.def(
diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py
new file mode 100644
index 000000000..9115949a1
--- /dev/null
+++ b/tests/kernels/test_apply_repetition_penalties.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+
+from tests.kernels.utils import opcheck
+from vllm._custom_ops import (apply_repetition_penalties_cuda,
+                              apply_repetition_penalties_torch)
+from vllm.platforms import current_platform
+
+NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
+# [stress, stress, stress, Qwen, llama 4]
+VOCAB_SIZES = [17, 256, 1019, 151936, 202048]
+REPETITION_PENALTY_VALUES = [1.05]
+SEEDS = [0]
+DTYPES = [torch.float32, torch.float16]
+
+
+@pytest.mark.parametrize("num_seqs", NUM_SEQS)
+@pytest.mark.parametrize("vocab_size", VOCAB_SIZES)
+@pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.skipif(not current_platform.is_cuda(),
+                    reason="This test for checking CUDA kernel")
+@torch.inference_mode()
+def test_apply_repetition_penalties(
+    num_seqs: int,
+    vocab_size: int,
+    repetition_penalty: float,
+    dtype: torch.dtype,
+    seed: int,
+) -> None:
+    """
+    Test the apply_repetition_penalties custom op 
+    against a reference implementation.
+    """
+    current_platform.seed_everything(seed)
+    torch.set_default_device("cuda:0")
+
+    # Create test data
+    logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
+
+    # Create masks with some random tokens marked as repeated
+    prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
+    output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
+
+    # Mark some tokens as repeated in prompt and output
+    prompt_indices = torch.randint(0, vocab_size,
+                                   (num_seqs, max(1, vocab_size // 200)))
+    output_indices = torch.randint(0, vocab_size,
+                                   (num_seqs, max(1, vocab_size // 200)))
+
+    for i in range(num_seqs):
+        prompt_mask[i, prompt_indices[i]] = True
+        output_mask[i, output_indices[i]] = True
+
+    # Create repetition penalties tensor
+    repetition_penalties = torch.full((num_seqs, ),
+                                      repetition_penalty,
+                                      dtype=dtype)
+
+    # Run all three implementations
+    logits_torch = logits.clone()
+    logits_cuda = logits.clone()
+
+    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
+                                     repetition_penalties)
+    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
+                                    repetition_penalties)
+
+    # Compare all outputs to reference
+    torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
+
+    # Test the operator by applying the opcheck utility
+    opcheck(torch.ops._C.apply_repetition_penalties_,
+            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 008a7aa94..3282edf41 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -282,6 +282,45 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
     torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
 
 
+def apply_repetition_penalties_torch(
+        logits: torch.Tensor, prompt_mask: torch.Tensor,
+        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
+    repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(
+        1, logits.size(1))
+    # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
+    penalties = torch.where(prompt_mask | output_mask, repetition_penalties,
+                            1.0)
+    # If logits are positive, divide by penalty, otherwise multiply by penalty.
+    scaling = torch.where(logits > 0, 1.0 / penalties, penalties)
+    logits *= scaling
+
+
+def apply_repetition_penalties_cuda(
+        logits: torch.Tensor, prompt_mask: torch.Tensor,
+        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
+    torch.ops._C.apply_repetition_penalties_(logits, prompt_mask, output_mask,
+                                             repetition_penalties)
+
+
+def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
+                               output_mask: torch.Tensor,
+                               repetition_penalties: torch.Tensor) -> None:
+    """Apply repetition penalties to logits in-place.
+
+    Args:
+        logits: The logits tensor of shape [num_seqs, vocab_size].
+        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
+        output_mask: A boolean tensor indicating which tokens appear in the output.
+        repetition_penalties: The repetition penalties of shape (num_seqs, ).
+    """
+    if current_platform.is_cuda() and logits.is_contiguous():
+        apply_repetition_penalties_cuda(logits, prompt_mask, output_mask,
+                                        repetition_penalties)
+    else:
+        apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
+                                         repetition_penalties)
+
+
 def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
                            input_tokens: torch.Tensor,
                            sampled_token_ids: torch.Tensor,
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index d97d84238..41b5253dc 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -50,16 +50,11 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
                                                    vocab_size, num_seqs)
     output_bin_counts, output_mask = get_token_bin_counts_and_mask(
         output_tokens_tensor, vocab_size, num_seqs)
-    repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(
-        1, vocab_size)
 
-    # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
-    penalties = torch.where(prompt_mask | output_mask, repetition_penalties,
-                            1.0)
-
-    # If logits are positive, divide by penalty, otherwise multiply by penalty.
-    scaling = torch.where(logits > 0, 1.0 / penalties, penalties)
-    logits *= scaling
+    # Apply repetition penalties as a custom op
+    from vllm._custom_ops import apply_repetition_penalties
+    apply_repetition_penalties(logits, prompt_mask, output_mask,
+                               repetition_penalties)
 
     # We follow the definition in OpenAI API.
     # Refer to https://platform.openai.com/docs/api-reference/parameter-details
-- 
GitLab


From 8d646c2e53d3d840a3442bdd00845a6b57eb666f Mon Sep 17 00:00:00 2001
From: Calvin Chen <45745657+calvin0327@users.noreply.github.com>
Date: Wed, 4 Jun 2025 12:23:26 +0800
Subject: [PATCH 159/274] [Cleanup][v1]:remote guided-decoding-backend for
 example (#19059)

Signed-off-by: calvin chen <120380290@qq.com>
---
 .../online_serving/openai_chat_completion_structured_outputs.py  | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
index 64379083d..5c55d5313 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
@@ -139,7 +139,6 @@ def extra_backend_options_completion(client: OpenAI, model: str):
             extra_body={
                 "guided_regex": r"\w+@\w+\.com\n",
                 "stop": ["\n"],
-                "guided_decoding_backend": "xgrammar",
                 "guided_decoding_disable_fallback": True,
             },
         )
-- 
GitLab


From 41aa5784287f00b026f3ba225ac18ab3caccc622 Mon Sep 17 00:00:00 2001
From: Kaixi Hou <kaixih@nvidia.com>
Date: Wed, 4 Jun 2025 12:40:26 +0800
Subject: [PATCH 160/274] [NVIDIA] Add Cutlass MLA backend (#17625)

---
 csrc/attention/mla/cutlass_mla_kernels.cu     |  2 +-
 tests/kernels/test_cutlass_mla_decode.py      |  4 +-
 vllm/engine/arg_utils.py                      |  1 +
 vllm/platforms/cuda.py                        |  8 ++
 vllm/platforms/interface.py                   |  1 +
 vllm/v1/attention/backends/mla/common.py      |  2 +-
 vllm/v1/attention/backends/mla/cutlass_mla.py | 96 +++++++++++++++++++
 7 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 vllm/v1/attention/backends/mla/cutlass_mla.py

diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu
index 6743af0cf..f4b6b19f4 100644
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ b/csrc/attention/mla/cutlass_mla_kernels.cu
@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
       {static_cast<ElementOut*>(out.data_ptr()), stride_O,
        static_cast<ElementAcc*>(nullptr), stride_LSE},
       hw_info,
-      -1,       // split_kv
+      1,        // split_kv
       nullptr,  // is_var_split_kv
   };
   // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py
index c56024b75..2b745b84d 100644
--- a/tests/kernels/test_cutlass_mla_decode.py
+++ b/tests/kernels/test_cutlass_mla_decode.py
@@ -76,7 +76,9 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
     pack_factor = 128 // block_size
     block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
 
-    q = torch.randn(bs, h_q, d)
+    # Amplify input values to ensure test coverage of edge cases where CUTLASS
+    # kernel errors occur with split_k settings.
+    q = torch.randn(bs, h_q, d) * 100
     block_table = torch.randint(0,
                                 bs * block_num, (bs, block_num),
                                 dtype=torch.int32)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b1c4b27a0..901346831 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1395,6 +1395,7 @@ class EngineArgs:
             "PALLAS_VLLM_V1",
             "TRITON_ATTN_VLLM_V1",
             "TRITON_MLA",
+            "CUTLASS_MLA_VLLM_V1",
             "FLASHMLA",
             "FLASHINFER",
             "FLASHINFER_VLLM_V1",
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 07ae470fa..bde606f0c 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -183,6 +183,14 @@ class CudaPlatformBase(Platform):
         if use_mla:
             # TODO(lucas): refactor to  be more concise
             #  we should probably consider factoring out V1 here
+            if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1:
+                if use_v1:
+                    logger.info_once("Using Cutlass MLA backend on V1 engine.")
+                    return ("vllm.v1.attention.backends.mla."
+                            "cutlass_mla.CutlassMLABackend")
+                else:
+                    logger.warning(
+                        "Cutlass MLA backend is only supported on V1 engine")
             if selected_backend == _Backend.TRITON_MLA or block_size != 64:
                 if use_v1:
                     logger.info_once("Using Triton MLA backend on V1 engine.")
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 1ec9c78a3..7fef697d8 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -51,6 +51,7 @@ class _Backend(enum.Enum):
     TRITON_MLA_VLLM_V1 = enum.auto()
     FLASHMLA_VLLM_V1 = enum.auto()
     FLASHMLA = enum.auto()  # Supported by V1
+    CUTLASS_MLA_VLLM_V1 = enum.auto()
     HPU_ATTN = enum.auto()
     PALLAS = enum.auto()
     PALLAS_VLLM_V1 = enum.auto()
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 06acbb909..e6b4f6404 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -350,7 +350,7 @@ class MLACommonMetadataBuilder(Generic[M]):
         self.num_heads = model_config.get_num_attention_heads(
             runner.parallel_config)
         self.mla_dims = get_mla_dims(model_config)
-        self.aot_schedule = is_vllm_fa and (get_flash_attn_version() == 3)
+        self.aot_schedule = current_platform.is_cuda()
         self.kv_cache_spec = kv_cache_spec
 
         # Dont try to access the runner on AMD
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
new file mode 100644
index 000000000..70aee058e
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+
+import vllm._custom_ops as ops
+from vllm.attention.backends.abstract import (AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
+                                                   MLACommonImpl,
+                                                   MLACommonMetadata)
+
+logger = init_logger(__name__)
+
+
+class CutlassMLABackend(MLACommonBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "CUTLASS_MLA_VLLM_V1"
+
+    @staticmethod
+    def get_impl_cls() -> type["CutlassMLAImpl"]:
+        return CutlassMLAImpl
+
+
+class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
+
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[list[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            blocksparse_params: Optional[dict[str, Any]],
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            # MLA Specific Arguments
+            **mla_args) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         blocksparse_params, logits_soft_cap, attn_type,
+                         **mla_args)
+
+        unsupported_features = [
+            alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
+        ]
+        if any(unsupported_features):
+            raise NotImplementedError(
+                "CutlassMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, blocksparse_params, "
+                "logits_soft_cap")
+
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "CutlassMLAImpl")
+
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "CutlassMLA V1 with FP8 KV cache not yet supported")
+
+    def _forward_decode(
+        self,
+        q_nope: torch.Tensor,
+        q_pe: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: MLACommonMetadata,
+    ) -> torch.Tensor:
+        assert kv_c_and_k_pe_cache.numel() > 0
+        assert attn_metadata.decode is not None
+
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError("FP8 Cutlass MLA not yet supported")
+
+        B = q_nope.shape[0]
+
+        o = torch.empty((B, self.num_heads, self.kv_lora_rank),
+                        dtype=q_nope.dtype,
+                        device=q_nope.device)
+
+        # Run MLA
+        # Clone q_nope and q_pe to make sure strides computation is correct.
+        q_nope = q_nope.clone()
+        q_pe = q_pe.clone()
+        ops.cutlass_mla_decode(o, q_nope, q_pe, kv_c_and_k_pe_cache,
+                               attn_metadata.decode.seq_lens,
+                               attn_metadata.decode.block_table, self.scale)
+
+        return self._v_up_proj(o)
-- 
GitLab


From b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 3 Jun 2025 23:10:15 -0700
Subject: [PATCH 161/274] [Bugfix] Fix FA3 full cuda graph correctness (#19106)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 .buildkite/test-pipeline.yaml                 |  1 +
 .../compile/piecewise/test_full_cudagraph.py  |  7 +++--
 vllm/v1/attention/backends/flash_attn.py      | 29 ++++++++++++++-----
 vllm/v1/worker/gpu_model_runner.py            |  5 ++++
 4 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8ab96b3b7..4ee6b499b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -320,6 +320,7 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/piecewise/test_full_cudagraph.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental, amdproduction]
diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py
index 3188ea40f..134bade48 100644
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -7,6 +7,7 @@ import pytest
 
 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
+from vllm.platforms import current_platform
 
 MODEL = "Qwen/Qwen2-1.5B-Instruct"
 
@@ -37,7 +38,7 @@ def full_cudagraph_llm():
             "VLLM_FLASH_ATTN_VERSION": "3"
     }):
         return LLM(model=MODEL,
-                   gpu_memory_utilization=0.2,
+                   gpu_memory_utilization=0.3,
                    compilation_config=CompilationConfig(full_cuda_graph=True))
 
 
@@ -48,7 +49,7 @@ def piecewise_llm():
             "VLLM_FLASH_ATTN_VERSION": "3"
     }):
         return LLM(model=MODEL,
-                   gpu_memory_utilization=0.5,
+                   gpu_memory_utilization=0.6,
                    compilation_config=CompilationConfig())
 
 
@@ -61,6 +62,8 @@ def generate_text(llm: LLM, batch_size: int, max_tokens: int):
     return llm.generate(prompts, sampling_params)
 
 
+@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0),
+                    reason="Only Hopper GPUs support FlashAttention 3")
 @pytest.mark.parametrize(("batch_size", "max_tokens"), [(1, 10), (7, 10),
                                                         (16, 10), (25, 10),
                                                         (32, 10), (45, 10),
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index a92c51883..a9f748d02 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -307,13 +307,14 @@ class FlashAttentionMetadataBuilder:
         self.kv_cache_spec = kv_cache_spec
         self.block_table = block_table
 
-        if get_flash_attn_version() == 3:
-            self.aot_schedule = not compilation_config.full_cuda_graph
-            if not self.aot_schedule:
-                logger.warning(
-                    "AOT Schedule is disabled when using full_cuda_graph")
-        else:
-            self.aot_schedule = False
+        self.aot_schedule = (get_flash_attn_version() == 3)
+        self.use_full_cuda_graph = compilation_config.full_cuda_graph
+        if self.use_full_cuda_graph and not self.aot_schedule:
+            raise ValueError("Full CUDA graph mode requires AOT scheduling, "
+                             "which requires FlashAttention 3.")
+        self.scheduler_metadata = torch.zeros(self.runner.max_num_reqs + 1,
+                                              dtype=torch.int32,
+                                              device=self.runner.device)
 
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
@@ -326,7 +327,7 @@ class FlashAttentionMetadataBuilder:
     def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata):
-        max_seq_len = self.runner.seq_lens_np[:num_reqs].max()
+        max_seq_len = int(self.runner.seq_lens_np[:num_reqs].max())
         query_start_loc = common_attn_metadata.query_start_loc
         seq_lens = common_attn_metadata.seq_lens
         block_table = self.block_table
@@ -448,6 +449,18 @@ class FlashAttentionMetadataBuilder:
                                           max_seq_len=max_seq_len,
                                           causal=True)
 
+        if self.use_full_cuda_graph:
+            assert scheduler_metadata is not None
+            n = scheduler_metadata.shape[0]
+            self.scheduler_metadata[:n].copy_(scheduler_metadata,
+                                              non_blocking=True)
+            # NOTE(woosuk): We should zero out the rest of the scheduler
+            # metadata to guarantee the correctness. Otherwise, some thread
+            # blocks may use the invalid scheduler metadata and overwrite the
+            # output buffer.
+            self.scheduler_metadata[n:] = 0
+            scheduler_metadata = self.scheduler_metadata[:n]
+
         attn_metadata = FlashAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
             max_query_len=max_query_len,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9ac33a149..4a67e3778 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1750,6 +1750,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             attn_metadata: Optional[dict[str, Any]] = None
         else:
             query_start_loc = self.query_start_loc[:num_reqs + 1]
+            # Make sure max_model_len is used at the graph capture time.
+            self.seq_lens_np[:num_reqs] = self.max_model_len
+            self.seq_lens_np[num_reqs:] = 0
+            self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
+                                           non_blocking=True)
             seq_lens = self.seq_lens[:num_reqs]
 
             common_attn_metadata = CommonAttentionMetadata(
-- 
GitLab


From 3336c8cfbef6c7d6688ca1e5b0b26424baef02c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= <wangzhipeng628@gmail.com>
Date: Wed, 4 Jun 2025 16:42:06 +0800
Subject: [PATCH 162/274] Fix #19130 (#19132)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
---
 .../vision_language_multi_image.py            | 36 +++++++++++++------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index de6365c0d..ea7a793d0 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -593,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
 
 def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     try:
-        from qwen_vl_utils import process_vision_info
+        from qwen_vl_utils import smart_resize
     except ModuleNotFoundError:
         print(
             "WARNING: `qwen-vl-utils` not installed, input images will not "
             "be automatically resized. You can enable this functionality by "
             "`pip install qwen-vl-utils`."
         )
-        process_vision_info = None
+        smart_resize = None
 
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
 
     # Tested on L40
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=32768 if process_vision_info is None else 4096,
+        max_model_len=32768 if smart_resize is None else 4096,
         max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
@@ -630,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    if process_vision_info is None:
+    if smart_resize is None:
         image_data = [fetch_image(url) for url in image_urls]
     else:
-        image_data, _ = process_vision_info(messages)
+
+        def post_process_image(image: Image) -> Image:
+            width, height = image.size
+            resized_height, resized_width = smart_resize(
+                height, width, max_pixels=1024 * 28 * 28
+            )
+            return image.resize((resized_width, resized_height))
+
+        image_data = [post_process_image(fetch_image(url)) for url in image_urls]
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -644,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
 
 def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     try:
-        from qwen_vl_utils import process_vision_info
+        from qwen_vl_utils import smart_resize
     except ModuleNotFoundError:
         print(
             "WARNING: `qwen-vl-utils` not installed, input images will not "
             "be automatically resized. You can enable this functionality by "
             "`pip install qwen-vl-utils`."
         )
-        process_vision_info = None
+        smart_resize = None
 
     model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
 
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=32768 if process_vision_info is None else 4096,
+        max_model_len=32768 if smart_resize is None else 4096,
         max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
@@ -680,10 +688,18 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    if process_vision_info is None:
+    if smart_resize is None:
         image_data = [fetch_image(url) for url in image_urls]
     else:
-        image_data, _ = process_vision_info(messages, return_video_kwargs=False)
+
+        def post_process_image(image: Image) -> Image:
+            width, height = image.size
+            resized_height, resized_width = smart_resize(
+                height, width, max_pixels=1024 * 28 * 28
+            )
+            return image.resize((resized_width, resized_height))
+
+        image_data = [post_process_image(fetch_image(url)) for url in image_urls]
 
     return ModelRequestData(
         engine_args=engine_args,
-- 
GitLab


From 8e972d9c44cc8a6b1d0a3596c41604c56a492977 Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Wed, 4 Jun 2025 01:43:00 -0700
Subject: [PATCH 163/274] [TPU] Skip hanging tests (#19115)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
---
 .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh |  2 +-
 tests/v1/tpu/test_spmd_model_weight_loading.py    | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 3212b660e..a394046d2 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k 'not test_structured_output_with_reasoning_matrices'"
 run_and_track_test 12 "test_moe_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py
index d36edfc3f..916325e41 100644
--- a/tests/v1/tpu/test_spmd_model_weight_loading.py
+++ b/tests/v1/tpu/test_spmd_model_weight_loading.py
@@ -45,11 +45,14 @@ def _get_spmd_mesh():
     return MESH
 
 
-@pytest.mark.parametrize("model", [
-    "Qwen/Qwen2-1.5B-Instruct",
-    "meta-llama/Llama-3.1-8B-Instruct",
-    "meta-llama/Llama-3.1-70B-Instruct",
-])
+@pytest.mark.parametrize(
+    "model",
+    [
+        "Qwen/Qwen2-1.5B-Instruct",
+        # Skip large models due to CI runner disk space limitations
+        # "meta-llama/Llama-3.1-8B-Instruct",
+        # "meta-llama/Llama-3.1-70B-Instruct",
+    ])
 def test_tpu_model_loader(model):
     # Skip the 70B test if there are less than 8 chips
     # TODO: Query using torch xla API, the query API is not working
-- 
GitLab


From 2669a0d7b518371bb1d950425bd64a320010733f Mon Sep 17 00:00:00 2001
From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Date: Wed, 4 Jun 2025 02:10:45 -0700
Subject: [PATCH 164/274] Fix ValueError: Missing value for tag key(s):
 model_name,engine. (#19113)

Signed-off-by: Seiji Eicher <seiji@anyscale.com>
---
 tests/v1/metrics/test_ray_metrics.py |  5 ++++-
 vllm/v1/metrics/ray_wrappers.py      | 10 ++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py
index ea54038a2..0898ae65e 100644
--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
@@ -47,12 +47,15 @@ def test_engine_log_metrics_ray(
                 engine_args, stat_loggers=[RayPrometheusStatLogger])
 
             for i, prompt in enumerate(example_prompts):
-                engine.generate(
+                results = engine.generate(
                     request_id=f"request-id-{i}",
                     prompt=prompt,
                     sampling_params=SamplingParams(max_tokens=max_tokens),
                 )
 
+                async for _ in results:
+                    pass
+
     # Create the actor and call the async method
     actor = EngineTestActor.remote()  # type: ignore[attr-defined]
     ray.get(actor.run.remote())
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index 18c8dcf0a..cce692d6c 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -31,6 +31,16 @@ class RayPrometheusMetric:
 
             self.metric.set_default_tags(labelskwargs)
 
+        if labels:
+            if len(labels) != len(self.metric._tag_keys):
+                raise ValueError(
+                    "Number of labels must match the number of tag keys. "
+                    f"Expected {len(self.metric._tag_keys)}, got {len(labels)}"
+                )
+
+            self.metric.set_default_tags(
+                dict(zip(self.metric._tag_keys, labels)))
+
         return self
 
 
-- 
GitLab


From 8711bc5e684d43a333c0c20bef575a0d8ee8346f Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 4 Jun 2025 19:18:48 +0800
Subject: [PATCH 165/274] [Misc] Add packages for benchmark as extra dependency
 (#19089)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/cli/README.md          |  2 ++
 setup.py                    |  1 +
 vllm/benchmarks/datasets.py | 39 ++++++++++++++++---------------------
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/docs/cli/README.md b/docs/cli/README.md
index f43ce7663..df700fb74 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -77,6 +77,8 @@ vllm complete --quick "The future of AI is"
 
 Run benchmark tests for latency online serving throughput and offline inference throughput.
 
+To use benchmark commands, please install with extra dependencies using `pip install vllm[bench]`.
+
 Available Commands:
 
 ```bash
diff --git a/setup.py b/setup.py
index b07cdea30..ea7cd0169 100644
--- a/setup.py
+++ b/setup.py
@@ -688,6 +688,7 @@ setup(
     ext_modules=ext_modules,
     install_requires=get_requirements(),
     extras_require={
+        "bench": ["pandas", "datasets"],
         "tensorizer": ["tensorizer>=2.9.0"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
         "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index f795a1256..4da9f7368 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -24,7 +24,6 @@ from io import BytesIO
 from typing import Any, Callable, Optional, Union
 
 import numpy as np
-import pandas as pd
 from PIL import Image
 from transformers import PreTrainedTokenizerBase
 
@@ -33,6 +32,23 @@ from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+from vllm.utils import PlaceholderModule
+
+try:
+    from datasets import load_dataset
+except ImportError:
+    datasets = PlaceholderModule("datasets")
+    load_dataset = datasets.placeholder_attr("load_dataset")
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = PlaceholderModule("pandas")
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")
 
 logger = logging.getLogger(__name__)
 
@@ -636,13 +652,6 @@ class BurstGPTDataset(BenchmarkDataset):
         if self.dataset_path is None:
             raise ValueError("dataset_path must be provided for loading data.")
 
-        try:
-            import pandas as pd
-        except ImportError as e:
-            raise ImportError(
-                "Pandas is required for BurstGPTDataset. Please install it "
-                "using `pip install pandas`.") from e
-
         df = pd.read_csv(self.dataset_path)
         # Filter to keep only GPT-4 rows.
         gpt4_df = df[df["Model"] == "GPT-4"]
@@ -717,13 +726,6 @@ class HuggingFaceDataset(BenchmarkDataset):
 
     def load_data(self) -> None:
         """Load data from HuggingFace datasets."""
-        try:
-            from datasets import load_dataset
-        except ImportError as e:
-            raise ImportError(
-                "Hugging Face datasets library is required for this dataset. "
-                "Please install it using `pip install datasets`.") from e
-
         self.data = load_dataset(
             self.dataset_path,
             name=self.dataset_subset,
@@ -1147,13 +1149,6 @@ class ASRDataset(HuggingFaceDataset):
         output_len: Optional[int] = None,
         **kwargs,
     ) -> list:
-        try:
-            import librosa
-        except ImportError as e:
-            raise ImportError(
-                "librosa is required for ASRDataset. Please install it "
-                "using `pip install librosa`.") from e
-
         output_len = (output_len
                       if output_len is not None else self.DEFAULT_OUTPUT_LEN)
         prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
-- 
GitLab


From 35cf32df304770b9dd3878438544b3a1a1cc79a5 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Wed, 4 Jun 2025 19:48:57 +0800
Subject: [PATCH 166/274] Improve the output precision of embedding models
 (#19092)

---
 tests/models/language/pooling/embed_utils.py  |  6 +--
 tests/models/language/pooling/mteb_utils.py   | 12 ++---
 tests/models/language/pooling/test_gte.py     |  7 ---
 .../models/language/pooling/test_intfloat.py  | 46 +++++++++++++++++++
 tests/models/language/pooling/test_jina.py    |  3 +-
 tests/models/language/pooling/test_nomic.py   |  3 --
 vllm/model_executor/models/bert.py            | 13 ++++--
 vllm/model_executor/models/bert_with_rope.py  |  7 ++-
 8 files changed, 69 insertions(+), 28 deletions(-)
 create mode 100644 tests/models/language/pooling/test_intfloat.py

diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
index 07bc9f447..dabd7bee7 100644
--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -56,14 +56,10 @@ def correctness_test_embed_models(hf_runner,
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
         vllm_outputs = vllm_model.encode(example_prompts)
-        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
-        model_dtype = getattr(
-            vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
-            vllm_dtype)
 
     with hf_runner(
             model_info.name,
-            dtype=model_dtype,
+            dtype="float32",
             is_sentence_transformer=True,
     ) as hf_model:
 
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py
index 2705be25e..0a047951d 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -7,7 +7,6 @@ import numpy as np
 import pytest
 
 from tests.models.utils import EmbedModelInfo
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 
 # Most models on the STS12 task (See #17175):
 # - Model implementation and minor changes in tensor dtype
@@ -104,17 +103,18 @@ def mteb_test_embed_models(hf_runner,
                                               MTEB_EMBED_TASKS)
         vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
 
-    with set_default_torch_dtype(vllm_dtype) and hf_runner(
-            model_info.name, is_sentence_transformer=True,
-            dtype=vllm_dtype) as hf_model:
+    with hf_runner(model_info.name,
+                   is_sentence_transformer=True,
+                   dtype="float32") as hf_model:
 
         if hf_model_callback is not None:
             hf_model_callback(hf_model)
 
         st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+        st_dtype = next(hf_model.model.parameters()).dtype
 
-    print("VLLM:", vllm_main_score)
-    print("SentenceTransformers:", st_main_score)
+    print("VLLM:", vllm_dtype, vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
     assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py
index 2178a815b..05bd479f4 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -11,27 +11,21 @@ MODELS = [
     ########## BertModel
     EmbedModelInfo("thenlper/gte-large",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=True),
     EmbedModelInfo("thenlper/gte-base",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-small",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-large-zh",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-base-zh",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-small-zh",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     ########### NewModel
     EmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
@@ -46,7 +40,6 @@ MODELS = [
     ########### Qwen2ForCausalLM
     EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                    architecture="Qwen2ForCausalLM",
-                   dtype="float32",
                    enable_test=True),
     ########## ModernBertModel
     EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py
new file mode 100644
index 000000000..b6e83857f
--- /dev/null
+++ b/tests/models/language/pooling/test_intfloat.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+from ...utils import EmbedModelInfo
+from .embed_utils import correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
+
+MODELS = [
+    ########## BertModel
+    EmbedModelInfo("intfloat/e5-small",
+                   architecture="BertModel",
+                   enable_test=True),
+    EmbedModelInfo("intfloat/e5-base",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/e5-large",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/multilingual-e5-small",
+                   architecture="BertModel",
+                   enable_test=False),
+    ########## XLMRobertaModel
+    EmbedModelInfo("intfloat/multilingual-e5-base",
+                   architecture="XLMRobertaModel",
+                   enable_test=True),
+    EmbedModelInfo("intfloat/multilingual-e5-large",
+                   architecture="XLMRobertaModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/multilingual-e5-large-instruct",
+                   architecture="XLMRobertaModel",
+                   enable_test=False),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 2adf34b29..33255021a 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -32,8 +32,7 @@ TEXTS_2 = [
 EMBEDDING_MODELS = [
     EmbedModelInfo("jinaai/jina-embeddings-v3",
                    architecture="XLMRobertaModel",
-                   is_matryoshka=True,
-                   dtype="float32")
+                   is_matryoshka=True)
 ]
 
 
diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py
index 59dbd74fb..e16ec239a 100644
--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling/test_nomic.py
@@ -9,18 +9,15 @@ from .mteb_utils import mteb_test_embed_models
 MODELS = [
     EmbedModelInfo("nomic-ai/nomic-embed-text-v1",
                    architecture="NomicBertModel",
-                   dtype="float32",
                    enable_test=True),
     EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
                    architecture="NomicBertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("nomic-ai/CodeRankEmbed",
                    architecture="NomicBertModel",
                    enable_test=False),
     EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
                    architecture="NomicBertModel",
-                   dtype="float32",
                    enable_test=True)
 ]
 
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 389393987..cacec7342 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -414,10 +414,15 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        return self.model(input_ids=input_ids,
-                          position_ids=positions,
-                          inputs_embeds=inputs_embeds,
-                          intermediate_tensors=intermediate_tensors)
+        hidden_states = self.model(input_ids=input_ids,
+                                   position_ids=positions,
+                                   inputs_embeds=inputs_embeds,
+                                   intermediate_tensors=intermediate_tensors)
+
+        # convert the embedding output to float32,
+        # otherwise precision will be lost significantly
+        hidden_states = hidden_states.to(torch.float32)
+        return hidden_states
 
     def pooler(
         self,
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 0f22393c7..d1b84a9f0 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -432,7 +432,12 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant):
         else:
             hidden_states = self.embeddings(input_ids=input_ids,
                                             token_type_ids=token_type_ids)
-        return self.encoder(positions, hidden_states)
+        hidden_states = self.encoder(positions, hidden_states)
+
+        # convert the embedding output to float32,
+        # otherwise precision will be lost significantly
+        hidden_states = hidden_states.to(torch.float32)
+        return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-- 
GitLab


From 01dc9a76db7d314aaf51be9ffc6ff561bae5626f Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 4 Jun 2025 19:49:20 +0800
Subject: [PATCH 167/274] [CI/Build][Bugfix] Ensure compatibility with
 transformers 4.52 (#18678)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 requirements/test.in                          |  2 +-
 requirements/test.txt                         |  2 +-
 .../multimodal/generation/test_common.py      |  9 +++-
 .../multimodal/generation/test_florence2.py   |  2 +
 .../generation/test_granite_speech.py         |  2 +-
 .../multimodal/generation/test_phi4mm.py      |  4 ++
 .../generation/vlm_utils/model_utils.py       | 18 ++++++-
 .../multimodal/processing/test_common.py      |  2 +-
 tests/models/registry.py                      | 47 ++++++-------------
 tests/models/test_initialization.py           | 11 +++++
 vllm/config.py                                |  2 +
 vllm/model_executor/models/aya_vision.py      | 12 +++--
 vllm/model_executor/models/idefics3.py        | 16 +++++--
 13 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/requirements/test.in b/requirements/test.in
index 9b574a09f..bbbd41e16 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -34,7 +34,7 @@ opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
-transformers==4.51.3
+transformers==4.52.4
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
 schemathesis>=3.39.15 # Required for openai schema test.
diff --git a/requirements/test.txt b/requirements/test.txt
index 03aec80ac..fb0eede08 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -794,7 +794,7 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.51.3
+transformers==4.52.4
     # via
     #   -r requirements/test.in
     #   genai-perf
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index a5bbcfc22..496850b19 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -226,6 +226,8 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "",
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
+        # FIXME: https://github.com/huggingface/transformers/pull/38510
+        marks=[pytest.mark.skip("Model is broken")],
     ),
     "chameleon": VLMTestInfo(
         models=["facebook/chameleon-7b"],
@@ -281,10 +283,10 @@ VLM_TEST_SETTINGS = {
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        dtype="bfloat16",
         auto_cls=AutoModelForImageTextToText,
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
+        num_logprobs=10,
     ),
     "glm4v": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
@@ -337,7 +339,8 @@ VLM_TEST_SETTINGS = {
         models=[
             "OpenGVLab/InternVL2-1B",
             "OpenGVLab/InternVL2-2B",
-            "OpenGVLab/Mono-InternVL-2B",
+            # FIXME: Config cannot be loaded in transformers 4.52
+            # "OpenGVLab/Mono-InternVL-2B",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
@@ -568,6 +571,8 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
+        # FIXME: https://github.com/huggingface/transformers/issues/38358
+        marks=[pytest.mark.skip("Model initialization fails")],
     ),
     "qwen2_vl": VLMTestInfo(
         models=["Qwen/Qwen2-VL-2B-Instruct"],
diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py
index b048cec5e..a622957f9 100644
--- a/tests/models/multimodal/generation/test_florence2.py
+++ b/tests/models/multimodal/generation/test_florence2.py
@@ -100,6 +100,8 @@ def run_test(
         )
 
 
+# FIXME: https://github.com/huggingface/transformers/issues/38358
+@pytest.mark.skip("Model initialization fails")
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize(
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index 14552010d..c5ffa5f3a 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -29,7 +29,7 @@ def vllm_to_hf_output(
     return output_ids, hf_output_str, out_logprobs
 
 
-MODEL_NAME = "ibm-granite/granite-speech-3.3-8b"
+MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
 # Audio lora co-exists directly in the model directory, but
 # currently still needs to be passed directly to vLLM.
 audio_lora_path = MODEL_NAME
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py
index e4cd476a9..4e8465778 100644
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -122,6 +122,10 @@ def run_test(
             for prompts, images, audios in inputs
         ]
 
+    # This error occurs inside `get_peft_model`
+    # FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75
+    pytest.skip("HF impl is not compatible with current transformers")
+
     hf_model_kwargs = {"_attn_implementation": "sdpa"}
     with hf_runner(model, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index 1b087191f..af4c72f44 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -10,11 +10,12 @@ from typing import Optional, Union
 
 import numpy as np
 import numpy.typing as npt
+import pytest
 import regex as re
 import torch
 from PIL.Image import Image
 from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
-                          GenerationConfig)
+                          GenerationConfig, GenerationMixin)
 
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
@@ -324,6 +325,16 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
 
     hf_model.processor = processor
 
+    orig_generate = hf_model.model.generate
+
+    def _generate(self, *args, **kwargs):
+        # FIXME: https://github.com/huggingface/transformers/issues/38333
+        kwargs["disable_compile"] = True
+
+        return orig_generate(*args, **kwargs)
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
     return hf_model
 
 
@@ -610,6 +621,11 @@ def _internvl_generate(
     if getattr(self, "use_visual_token_mask", False):
         visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
         forward_kwargs["visual_token_mask"] = visual_token_mask
+
+    # e.g. InternVL2-2B
+    if not isinstance(self.language_model, GenerationMixin):
+        pytest.skip("HF impl is not compatible with current transformers")
+
     outputs = self.language_model.generate(
         **forward_kwargs,
         **generate_kwargs,
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index be574435e..1e6608955 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -245,7 +245,7 @@ def _test_processing_correctness_one(
     "adept/fuyu-8b",
     "google/gemma-3-4b-it",
     "THUDM/glm-4v-9b",
-    "ibm-granite/granite-speech-3.3-8b",
+    "ibm-granite/granite-speech-3.3-2b",
     "h2oai/h2ovl-mississippi-800m",
     "OpenGVLab/InternVL2-1B",
     "OpenGVLab/InternVL3-1B",
diff --git a/tests/models/registry.py b/tests/models/registry.py
index ed49676a9..3e07dc0f3 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -160,17 +160,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
     "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
-                                          is_available_online=False,
-                                          min_transformers_version="4.52.2"),
+                                          min_transformers_version="4.53"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
     "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
     "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
     "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
-    "Glm4ForCausalLM": _HfExamplesInfo(
-        "THUDM/GLM-4-32B-0414",
-        is_available_online=False,
-        min_transformers_version="4.52.dev0"
-    ),
+    "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"),
     "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
                                        {"alias": "gpt2"}),
     "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
@@ -181,8 +176,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                           {"1b": "EleutherAI/pythia-1.4b"}),
     "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
     "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
-    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview",  # noqa: E501
-                                                   min_transformers_version="4.52.0"),  # noqa: E501
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501
     "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
     "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                              trust_remote_code=True),
@@ -203,8 +197,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
-    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
-                                         is_available_online=False),
+    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
     "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
     "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
                                          trust_remote_code=True),
@@ -243,10 +236,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
-    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
-                                     is_available_online=False),
+    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
     "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b",  # noqa: E501
-                                                is_available_online=False),
+                                                v0_only=True),
     "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
                                            v0_only=True),
     "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
@@ -256,7 +248,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
                                             trust_remote_code=True),
     "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
-                                         is_available_online=False,
+                                         tokenizer="meta-llama/Llama-2-7b",
                                          trust_remote_code=True),
     "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
     "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
@@ -275,8 +267,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
                                                trust_remote_code=True),
     "GteNewModel": _HfExamplesInfo("Alibaba-NLP/gte-base-en-v1.5",
                                    trust_remote_code=True,
-                                   hf_overrides={"architectures":
-                                                     ["GteNewModel"]}),
+                                   hf_overrides={"architectures": ["GteNewModel"]}),  # noqa: E501
     "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
                                                trust_remote_code=True),
     "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
@@ -298,10 +289,8 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
                                          trust_remote_code=True),
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
-    # The model on Huggingface is currently being updated,
-    # hence I temporarily mark it as not available online
-    "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",  # noqa: E501
-                                            is_available_online=False),
+    "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
+                                            is_available_online=False),  # noqa: E501
 }
 
 _CROSS_ENCODER_EXAMPLE_MODELS = {
@@ -327,8 +316,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}),  # noqa: E501
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
     "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
-    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-8b",  # noqa: E501
-                                                             min_transformers_version="4.52.0"),  # noqa: E501
+    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"),  # noqa: E501
     "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
                                         trust_remote_code=True,
                                         hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
@@ -347,7 +335,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       trust_remote_code=True,
                                                       v0_only=True),
     "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",   # noqa: E501
-                                                      min_transformers_version="4.51",
                                                       max_model_len=10240),
     "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
                                                      extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
@@ -360,8 +347,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       transformers_version_reason="HF model is not compatible.",  # noqa: E501
                                                       hf_overrides={"architectures": ["MantisForConditionalGeneration"]}),  # noqa: E501
     "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
-                                max_transformers_version="4.48",
-                                transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
                                 trust_remote_code=True),
     "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
                                 extras={"2.6": "openbmb/MiniCPM-V-2_6"},  # noqa: E501
@@ -399,10 +384,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
     "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"),  # noqa: E501
-    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B",
-                                        min_transformers_version="4.52"),
-    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ",  # noqa: E501
-                                                           min_transformers_version="4.52"),
+    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
+    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
     "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
@@ -413,8 +396,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
     # Therefore, we borrow the BartTokenizer from the original Bart model
     "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
-                                                         tokenizer="Isotr0py/Florence-2-tokenizer",
-                                                         trust_remote_code=True,),  # noqa: E501
+                                                         tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
+                                                         trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 }
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index af023d903..98a58d01e 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -21,6 +21,10 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
+    # FIXME: Possible memory leak in the previous tests?
+    if model_arch == "GraniteSpeechForConditionalGeneration":
+        pytest.skip("Avoid OOM")
+
     # Avoid OOM and reduce initialization time by only using 1 layer
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
         hf_config.update(model_info.hf_overrides)
@@ -41,6 +45,13 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
                 "num_hidden_layers": 1,
             })
 
+        # e.g.: ibm-granite/granite-speech-3.3-2b
+        if hasattr(hf_config, "encoder_config"):
+            hf_config.encoder_config.update({
+                "num_layers": 1,
+                "num_hidden_layers": 1,
+            })
+
         return hf_config
 
     # Avoid calling model.forward()
diff --git a/vllm/config.py b/vllm/config.py
index f6ca9328b..a07c41dda 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3139,6 +3139,8 @@ def _find_dtype(
         config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
         config_dtype = getattr(config.vision_config, "torch_dtype", None)
+    if config_dtype is None and hasattr(config, "encoder_config"):
+        config_dtype = getattr(config.encoder_config, "torch_dtype", None)
 
     # Try to read the dtype of the weights if they are in safetensors format
     if config_dtype is None:
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 22efb707a..7e15e57a4 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -111,7 +111,13 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
         return self.ctx.get_hf_config(AyaVisionConfig)
 
     def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
-        return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
+        processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
+
+        # Temporary workaround since this processor has multiple image tokens
+        # See https://github.com/huggingface/transformers/issues/38350
+        processor._check_special_mm_tokens = lambda *args, **kwargs: None
+
+        return processor
 
     def get_image_processor(self) -> GotOcr2ImageProcessor:
         return self.get_hf_processor().image_processor
@@ -188,9 +194,7 @@ class AyaVisionMultiModalProcessor(
         image_processor = hf_processor.image_processor
 
         # HF processor pops the `num_patches` kwarg, which is needed by vLLM
-        if (images :=
-                mm_data.get("images")) is not None and '<image>' in prompt:
-            assert isinstance(images, list)
+        if (images := mm_data.get("images")) is not None:
             parsed_images = (self._get_data_parser().parse_mm_data({
                 "image":
                 images
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 4bc5e2a0c..de8596282 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -22,8 +22,8 @@ from typing import Literal, Optional, TypedDict, Union
 
 import torch
 from torch import nn
-from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor,
-                          Idefics3Processor)
+from transformers import (AddedToken, BatchFeature, Idefics3Config,
+                          Idefics3ImageProcessor, Idefics3Processor)
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -199,13 +199,21 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
 
         return grid_w * grid_h + 1
 
+    # TODO: Remove after requiring transformers>=4.52
+    def _get_content(self, token: Union[AddedToken, str]) -> str:
+        if isinstance(token, str):
+            return token
+
+        return token.content
+
     def _get_image_token(
             self,
             processor: Optional[Idefics3Processor]) -> tuple[str, str, str]:
         if processor is None:
             processor = self.get_hf_processor()
-        image_token = processor.image_token.content
-        fake_image_token = processor.fake_image_token.content
+
+        image_token = self._get_content(processor.image_token)
+        fake_image_token = self._get_content(processor.fake_image_token)
         global_image_token = processor.global_image_tag
         return image_token, fake_image_token, global_image_token
 
-- 
GitLab


From 02658c2dfed40acaf04c8d2470b3493e8fead523 Mon Sep 17 00:00:00 2001
From: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Date: Wed, 4 Jun 2025 21:24:18 +0800
Subject: [PATCH 168/274] Add DeepSeek-R1-0528 function call chat template
 (#18874)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
---
 docs/features/tool_calling.md                |  6 +-
 examples/tool_chat_template_deepseekr1.jinja | 92 ++++++++++++++++++++
 2 files changed, 96 insertions(+), 2 deletions(-)
 create mode 100644 examples/tool_chat_template_deepseekr1.jinja

diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 6ee1060dd..3547069f7 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -238,9 +238,11 @@ Flags: `--tool-call-parser hermes`
 ### DeepSeek-V3 Models (`deepseek_v3`)
 
 Supported models:
-* `deepseek-ai/DeepSeek-V3-0324`
 
-Flags: `--tool-call-parser deepseek_v3 --chat-template examples/tool_chat_template_deepseekv3.jinja`
+* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>)
+* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>)
+
+Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
 
 ### Models with Pythonic Tool Calls (`pythonic`)
 
diff --git a/examples/tool_chat_template_deepseekr1.jinja b/examples/tool_chat_template_deepseekr1.jinja
new file mode 100644
index 000000000..9ae19341f
--- /dev/null
+++ b/examples/tool_chat_template_deepseekr1.jinja
@@ -0,0 +1,92 @@
+{% if not add_generation_prompt is defined %}
+    {% set add_generation_prompt = false %}
+{% endif %}
+{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %}
+{%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+        {%- if ns.is_first_sp %}
+            {% set ns.system_prompt = ns.system_prompt + message['content'] %}
+            {% set ns.is_first_sp = false %}
+        {%- else %}
+            {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+
+{#- Adapted from https://github.com/sgl-project/sglang/blob/main/examples/chat_template/tool_chat_template_deepseekr1.jinja #}
+{% if tools is defined and tools is not none %}
+    {% set tool_ns = namespace(text='You are a helpful assistant with tool calling capabilities. '
+        'When a tool call is needed, you MUST use the following format to issue the call:\n'
+        '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>FUNCTION_NAME\n'
+        '```json\n{"param1": "value1", "param2": "value2"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>\n\n'
+        'Make sure the JSON is valid.'
+        '## Tools\n\n### Function\n\nYou have the following functions available:\n\n') %}
+    {% for tool in tools %}
+        {% set tool_ns.text = tool_ns.text + '\n```json\n' + (tool | tojson) + '\n```\n' %}
+    {% endfor %}
+    {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %}
+{% endif %}
+
+{{ bos_token }}
+{{ ns.system_prompt }}
+{%- for message in messages %}
+    {% set content = message['content'] %}
+    {%- if message['role'] == 'user' %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_first = false -%}
+        {%- set ns.is_last_user = true -%}
+        {{'<｜User｜>' + content + '<｜Assistant｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' %}
+        {% if '</think>' in content %}
+            {% set content = content.split('</think>')[-1] %}
+        {% endif %}
+    {% endif %}
+    {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>'}}
+        {%- endif %}
+        {%- set ns.is_first = false %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_output_first = true %}
+        {%- for tool in message['tool_calls'] %}
+            {%- if not ns.is_first %}
+                {%- if content is none %}
+                    {{'<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- else %}
+                    {{content + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- endif %}
+                {%- set ns.is_first = true -%}
+            {%- else %}
+                {{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+            {%- endif %}
+        {%- endfor %}
+        {{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>' + content + '<｜end▁of▁sentence｜>'}}
+            {%- set ns.is_tool = false -%}
+        {%- else %}
+            {{content + '<｜end▁of▁sentence｜>'}}
+        {%- endif %}
+    {%- endif %}
+    {%- if message['role'] == 'tool' %}
+        {%- set ns.is_last_user = false -%}
+        {%- set ns.is_tool = true -%}
+        {%- if ns.is_output_first %}
+            {{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + content + '<｜tool▁output▁end｜>'}}
+            {%- set ns.is_output_first = false %}
+        {%- else %}
+            {{'\n<｜tool▁output▁begin｜>' + content + '<｜tool▁output▁end｜>'}}
+        {%- endif %}
+    {%- endif %}
+{%- endfor -%}
+{% if ns.is_tool %}
+    {{'<｜tool▁outputs▁end｜>'}}
+{% endif %}
+{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
+    {{'<｜Assistant｜>'}}
+{% endif %}
-- 
GitLab


From 5f2cd251d212eed3052c5406875eb26811335d3e Mon Sep 17 00:00:00 2001
From: Lain <fusiyuan2000@hotmail.com>
Date: Wed, 4 Jun 2025 07:48:45 -0700
Subject: [PATCH 169/274] Sm100 blockwise fp8 swap ab (#18564)

---
 .../c3x/scaled_mm_blockwise_sm100_fp8.cu      |   4 -
 ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh | 206 ++++++++++++------
 .../layers/quantization/utils/fp8_utils.py    |  14 --
 3 files changed, 140 insertions(+), 84 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
index 84492553c..4a8a5ed02 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
@@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
                                            torch::Tensor const& b,
                                            torch::Tensor const& a_scales,
                                            torch::Tensor const& b_scales) {
-  TORCH_CHECK(
-      a.size(0) % 4 == 0,
-      "Input tensor must have a number of rows that is a multiple of 4. ",
-      "but got: ", a.size(0), " rows.");
   if (out.dtype() == torch::kBFloat16) {
     cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
         out, a, b, a_scales, b_scales);
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
index ef324364c..c841125db 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "cuda_utils.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 
@@ -22,49 +23,49 @@ namespace vllm {
 
 using namespace cute;
 
-template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
-          class ClusterShape, typename EpilogueScheduler,
-          typename MainloopScheduler>
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler,
+          bool swap_ab_ = false>
 struct cutlass_3x_gemm_fp8_blockwise {
+  static constexpr bool swap_ab = swap_ab_;
   using ElementAB = cutlass::float_e4m3_t;
 
   using ElementA = ElementAB;
   using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
   static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
 
   using ElementB = ElementAB;
   using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
   static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
 
-  using ElementC = void;
   using ElementD = OutType;
   using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
   static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
 
+  using ElementC = void; // TODO: support bias
   using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
   static constexpr int AlignmentC = AlignmentD;
 
   using ElementAccumulator = float;
   using ElementCompute = float;
   using ElementBlockScale = float;
 
-  // MMA and Cluster Tile Shapes
-  // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
-  // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
-  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
-  static constexpr int ScaleGranularityM =
-      size<0>(MmaTileShape{}) / ScaleMsPerTile;
-  static constexpr int ScaleGranularityN =
-      size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
-  static constexpr int ScaleGranularityK =
-      size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
-
-  // Shape of the threadblocks in a cluster
-  using ClusterShape_MNK = ClusterShape;
-
-  using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
-      ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-      cute::UMMA::Major::MN, cute::UMMA::Major::K>;
+  using ScaleConfig = conditional_t<swap_ab,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::K, cute::UMMA::Major::MN>,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
+
+  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
   using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
   using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
 
@@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise {
 
   static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
   using ElementScalar = float;
-  // clang-format off
   using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
   using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       ArchTag,
@@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise {
       ElementAccumulator,
       ElementCompute,
       ElementC,
-      LayoutC,
+      conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
       AlignmentC,
       ElementD,
-      LayoutD,
+      conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
       AlignmentD,
       EpilogueScheduler,
       DefaultOperation
   >::CollectiveOp;
  
   using StageCountType = cutlass::gemm::collective::StageCountAuto; 
-  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
-      ArchTag,
-      OperatorClass,
-      ElementA,
-      cute::tuple<LayoutA, LayoutSFA>,
-      AlignmentA,
-      ElementB,
-      cute::tuple<LayoutB, LayoutSFB>,
-      AlignmentB,
-      ElementAccumulator,
-      MmaTileShape,
-      ClusterShape,
-
+  using CollectiveMainloop = conditional_t<swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementB,
+          cute::tuple<LayoutB_Transpose, LayoutSFA>,
+          AlignmentB,
+          ElementA,
+          cute::tuple<LayoutA_Transpose, LayoutSFB>,
+          AlignmentA,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
           cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-      MainloopScheduler
-  >::CollectiveOp;
-  // clang-format on
+          MainloopScheduler
+      >::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementA,
+          cute::tuple<LayoutA, LayoutSFA>,
+          AlignmentA,
+          ElementB,
+          cute::tuple<LayoutB, LayoutSFB>,
+          AlignmentB,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          MainloopScheduler
+      >::CollectiveOp>;
 
   using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
@@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                    torch::Tensor const& b,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
   using GemmKernel = typename Gemm::GemmKernel;
   using StrideA = typename Gemm::GemmKernel::StrideA;
   using StrideB = typename Gemm::GemmKernel::StrideB;
@@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   using ElementD = typename Gemm::ElementD;
 
   int32_t m = a.size(0), n = b.size(1), k = a.size(1);
-  auto prob_shape = cute::make_shape(m, n, k, 1);
 
   StrideA a_stride;
   StrideB b_stride;
@@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   b_stride =
       cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
   c_stride =
-      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
 
-  LayoutSFA layout_SFA =
+  LayoutSFA layout_SFA = swap_ab ? 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
       ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
-  LayoutSFB layout_SFB =
+  LayoutSFB layout_SFB = swap_ab ?
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
       ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
 
   auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
   auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
 
-  typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr,        a_stride,   b_ptr,        b_stride,
-      a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};
+  auto mainloop_args = [&](){
+    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+    if (swap_ab) {
+      return typename GemmKernel::MainloopArguments{
+          b_ptr,        b_stride,   a_ptr,        a_stride,
+          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
+      };
+    }
+    else {
+      return typename GemmKernel::MainloopArguments{
+          a_ptr,        a_stride,   b_ptr,        b_stride,
+          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+      };
+    }
+  }();
+  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
 
   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
@@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
                                                torch::Tensor const& b,
                                                torch::Tensor const& a_scales,
                                                torch::Tensor const& b_scales) {
-  auto m = a.size(0);
-  auto k = a.size(1);
-  auto n = b.size(1);
-  int sms;
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
   cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
 
-  auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
-    return std::ceil(static_cast<float>(m) / tile1SM) *
-               std::ceil(static_cast<float>(n) / tile1SM) >=
-           sms;
-  };
-  bool use_2sm = should_use_2sm(m, n);
-  if (use_2sm) {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
-        Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-        out, a, b, a_scales, b_scales);
+  constexpr int TILE_K = 128;
+  // TODO: better heuristics
+  bool swap_ab = (m < 16) || (m % 4 != 0);
+  bool use_tma_epilogue = (m * n) % 4 == 0;
+  if (!swap_ab) {
+    constexpr int TILE_N = 128;
+    int tile_m = 256;
+    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
+      tile_m = 64;
+    }
+    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
+      tile_m = 128;
+    }
+    if (tile_m == 64) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else if (tile_m == 128) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else { // tile_m == 256
+      if (use_tma_epilogue) {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    }
   } else {
+    // TODO: Test more tile N configs
+    constexpr int TILE_M = 128;
+    constexpr int TILE_N = 16;
+    // TMA epilogue isn't compatible with Swap A/B
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
-        Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
         out, a, b, a_scales, b_scales);
   }
 }
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 1ebd2a898..270979c8e 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -136,24 +136,10 @@ def apply_w8a8_block_fp8_linear(
         use_cutlass, use_aiter_and_is_supported)
 
     if use_cutlass:
-        rows, cols = input_2d.shape
-        # Blackwell GPUs (SM100) require row dimensions to be multiple of 4 for
-        # optimal tensor core usage. Can be removed when targeting platforms
-        # without this constraint.
-        should_pad = current_platform.has_device_capability(
-            100) and rows % 4 != 0
-        if should_pad:
-            input_2d = torch.nn.functional.pad(input_2d,
-                                               (0, 0, 0, 4 - (rows % 4)),
-                                               value=0).contiguous()
-
         q_input, x_scale = per_token_group_quant_fp8(
             input_2d, block_size[1], column_major_scales=use_cutlass)
-
         output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
                                       block_size, input.dtype)
-        if should_pad:
-            output = output[:rows, :]
 
     else:
         q_input, x_scale = per_token_group_quant_fp8(
-- 
GitLab


From 8f4ffbd373cb19e8f8dcfa6dec1dbbe98fbeae96 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 4 Jun 2025 22:57:55 +0800
Subject: [PATCH 170/274] [Doc] Update V1 Guide for embedding models (#19141)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/usage/v1_guide.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index 7c4909cb5..baeb5411b 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -55,7 +55,7 @@ This living user guide outlines a few known **important changes and limitations*
 | **Spec Decode**                             | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
 | **Prompt Logprobs with Prefix Caching**     | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
 | **Structured Output Alternative Backends**  | <nobr>🟡 Planned</nobr>                                                           |
-| **Embedding Models**                        | <nobr>🚧 WIP ([PR #18015](https://github.com/vllm-project/vllm/pull/18015))</nobr> |
+| **Embedding Models**                        | <nobr>🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188))</nobr> |
 | **Mamba Models**                            | <nobr>🟡 Planned</nobr>                                                           |
 | **Encoder-Decoder Models**                  | <nobr>🟠 Delayed</nobr>                                                           |
 | **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr>                                                        |
@@ -145,9 +145,9 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco
 and the majority fall into the following categories. V1 support for these models will be added eventually.
 
 **Embedding Models**  
-Initially, we will create a [separate model runner](https://github.com/vllm-project/vllm/pull/18015) to provide V1 support without conflicting with other ongoing work.
+The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188).
 
-Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. [PR #16188](https://github.com/vllm-project/vllm/pull/16188) is the first step towards enabling this.
+Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1.
 
 **Mamba Models**  
 Models using selective state-space mechanisms (instead of standard transformer attention)
-- 
GitLab


From c8dcc159214a20650451dcd64b226f56671763f1 Mon Sep 17 00:00:00 2001
From: jmswen <jmswen@users.noreply.github.com>
Date: Wed, 4 Jun 2025 08:26:47 -0700
Subject: [PATCH 171/274] Allow AsyncLLMEngine.generate to target a specific DP
 rank (#19102)

Signed-off-by: Jon Swenson <jmswen@gmail.com>
---
 .../multi_instance_data_parallel.py           | 58 +++++++++++++++++++
 tests/tokenization/test_detokenize.py         |  3 +-
 tests/v1/engine/test_engine_core.py           |  1 +
 tests/v1/engine/test_engine_core_client.py    |  1 +
 tests/v1/engine/test_output_processor.py      |  5 ++
 vllm/engine/async_llm_engine.py               | 12 +++-
 vllm/v1/engine/__init__.py                    |  1 +
 vllm/v1/engine/async_llm.py                   |  5 +-
 vllm/v1/engine/core_client.py                 | 14 ++++-
 vllm/v1/engine/processor.py                   |  2 +
 10 files changed, 97 insertions(+), 5 deletions(-)
 create mode 100644 examples/online_serving/multi_instance_data_parallel.py

diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py
new file mode 100644
index 000000000..62b1ec71a
--- /dev/null
+++ b/examples/online_serving/multi_instance_data_parallel.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+from typing import Optional
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+
+"""
+To run this example, run the following commands simultaneously with
+different CUDA_VISIBLE_DEVICES:
+    python examples/online_serving/multi_instance_data_parallel.py
+
+    vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
+        --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
+        --data-parallel-size-local 1 --enforce-eager --headless
+
+Once both instances have completed the handshake, this example will
+send a request to the instance with DP rank 1.
+"""
+
+
+async def main():
+    engine_args = AsyncEngineArgs(
+        model="ibm-research/PowerMoE-3b",
+        data_parallel_size=2,
+        dtype="auto",
+        max_model_len=2048,
+        data_parallel_address="127.0.0.1",
+        data_parallel_rpc_port=62300,
+        data_parallel_size_local=1,
+        enforce_eager=True,
+    )
+
+    engine_client = AsyncLLMEngine.from_engine_args(engine_args)
+
+    sampling_params = SamplingParams(
+        temperature=0.7,
+        top_p=0.9,
+        max_tokens=100,
+    )
+
+    prompt = "Who won the 2004 World Series?"
+    final_output: Optional[RequestOutput] = None
+    async for output in engine_client.generate(
+        prompt=prompt,
+        sampling_params=sampling_params,
+        request_id="abcdef",
+        data_parallel_rank=1,
+    ):
+        final_output = output
+    if final_output:
+        print(final_output.outputs[0].text)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index b289dc972..9f2414eca 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -70,7 +70,8 @@ def _run_incremental_decode(tokenizer,
                                 None,
                                 0.0,
                                 None,
-                                cache_salt=None)
+                                cache_salt=None,
+                                data_parallel_rank=None)
 
     if fast is None:
         detokenizer = IncrementalDetokenizer.from_new_request(
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 3d7632a60..1cbbf3037 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -42,6 +42,7 @@ def make_request() -> EngineCoreRequest:
         arrival_time=time.time(),
         lora_request=None,
         cache_salt=None,
+        data_parallel_rank=None,
     )
 
 
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 47181d36f..c2dc3b473 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -56,6 +56,7 @@ def make_request(
         arrival_time=time.time(),
         lora_request=None,
         cache_salt=None,
+        data_parallel_rank=None,
     )
 
 
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index a83454ee6..6b88b0cf1 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -59,6 +59,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
                           eos_token_id=None,
                           lora_request=None,
                           cache_salt=None,
+                          data_parallel_rank=None,
                           sampling_params=SamplingParams(
                               skip_special_tokens=False,
                               spaces_between_special_tokens=False,
@@ -406,6 +407,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
                           eos_token_id=None,
                           lora_request=None,
                           cache_salt=None,
+                          data_parallel_rank=None,
                           sampling_params=SamplingParams(
                               skip_special_tokens=False,
                               spaces_between_special_tokens=False,
@@ -569,6 +571,7 @@ def test_stop_token(include_stop_str_in_output: bool,
         eos_token_id=eos_token_id,
         lora_request=None,
         cache_salt=None,
+        data_parallel_rank=None,
         sampling_params=SamplingParams(
             skip_special_tokens=False,
             spaces_between_special_tokens=False,
@@ -666,6 +669,7 @@ def test_stop_string(include_stop_str_in_output: bool,
             eos_token_id=None,
             lora_request=None,
             cache_salt=None,
+            data_parallel_rank=None,
             sampling_params=SamplingParams(
                 skip_special_tokens=False,
                 spaces_between_special_tokens=False,
@@ -780,6 +784,7 @@ def test_iteration_stats(dummy_test_vectors):
             eos_token_id=None,
             lora_request=None,
             cache_salt=None,
+            data_parallel_rank=None,
             sampling_params=SamplingParams(),
         ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 6d8d97cf5..59971f5d6 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -442,6 +442,7 @@ class _AsyncLLMEngine(LLMEngine):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> None:
         ...
 
@@ -456,6 +457,7 @@ class _AsyncLLMEngine(LLMEngine):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> None:
         ...
 
@@ -473,6 +475,7 @@ class _AsyncLLMEngine(LLMEngine):
             trace_headers: Optional[Mapping[str, str]] = None,
             prompt_adapter_request: Optional[PromptAdapterRequest] = None,
             priority: int = 0,
+            data_parallel_rank: Optional[int] = None,
             *,
             inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> None:
@@ -902,6 +905,7 @@ class AsyncLLMEngine(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> Coroutine[None, None, AsyncGenerator[Union[
             RequestOutput, PoolingRequestOutput], None]]:
         ...
@@ -917,6 +921,7 @@ class AsyncLLMEngine(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> Coroutine[None, None, AsyncGenerator[Union[
             RequestOutput, PoolingRequestOutput], None]]:
         ...
@@ -935,6 +940,7 @@ class AsyncLLMEngine(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
         *,
         inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
@@ -967,6 +973,7 @@ class AsyncLLMEngine(EngineClient):
             trace_headers=trace_headers,
             prompt_adapter_request=prompt_adapter_request,
             priority=priority,
+            data_parallel_rank=data_parallel_rank,
         )
 
         return stream.generator()
@@ -980,6 +987,7 @@ class AsyncLLMEngine(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
 
@@ -999,7 +1007,8 @@ class AsyncLLMEngine(EngineClient):
                                             for generation, if any.
             priority: The priority of the request.
                 Only applicable with priority scheduling.
-
+            data_parallel_rank: The (global) data parallel rank that must
+                handle this request. Only applicable if DP is enabled.
         Yields:
             The output `RequestOutput` objects from the LLMEngine
             for the request.
@@ -1057,6 +1066,7 @@ class AsyncLLMEngine(EngineClient):
                     trace_headers=trace_headers,
                     prompt_adapter_request=prompt_adapter_request,
                     priority=priority,
+                    data_parallel_rank=data_parallel_rank,
             ):
                 yield LLMEngine.validate_output(output, RequestOutput)
         except asyncio.CancelledError:
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index d1bec2523..59463f1ba 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -55,6 +55,7 @@ class EngineCoreRequest(
     arrival_time: float
     lora_request: Optional[LoRARequest]
     cache_salt: Optional[str]
+    data_parallel_rank: Optional[int]
 
     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 0e3696321..61ea3c4c3 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -229,6 +229,7 @@ class AsyncLLM(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> RequestOutputCollector:
         """Add new request to the AsyncLLM."""
 
@@ -245,7 +246,7 @@ class AsyncLLM(EngineClient):
         prompt_str, request = self.processor.process_inputs(
             request_id, prompt, params, arrival_time, lora_request,
             tokenization_kwargs, trace_headers, prompt_adapter_request,
-            priority)
+            priority, data_parallel_rank)
 
         if params.n == 1:
             await self._add_request(request, prompt_str, None, 0, queue)
@@ -291,6 +292,7 @@ class AsyncLLM(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -321,6 +323,7 @@ class AsyncLLM(EngineClient):
                 trace_headers=trace_headers,
                 prompt_adapter_request=prompt_adapter_request,
                 priority=priority,
+                data_parallel_rank=data_parallel_rank,
             )
 
             # The output_handler task pushes items into the queue.
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index adb0709c8..0cd58d01d 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -982,7 +982,16 @@ class DPAsyncMPClient(AsyncMPClient):
         resources.stats_update_task = asyncio.create_task(
             run_engine_stats_update_task())
 
-    def get_core_engine_for_request(self) -> CoreEngine:
+    def get_core_engine_for_request(self,
+                                    dp_rank: Optional[int] = None
+                                    ) -> CoreEngine:
+        if dp_rank is not None:
+            # engines are already in rank order
+            if dp_rank < 0 or dp_rank >= len(self.core_engines):
+                raise ValueError(f"Requested DP rank {dp_rank} is out of "
+                                 f"range [0, {len(self.core_engines)})")
+            return self.core_engines[dp_rank]
+
         if not self.lb_engines:
             return self.core_engines[0]
         # TODO use P2C alg for larger DP sizes
@@ -1018,7 +1027,8 @@ class DPAsyncMPClient(AsyncMPClient):
         request.current_wave = self.current_wave
         request.client_index = self.client_index
 
-        chosen_engine = self.get_core_engine_for_request()
+        chosen_engine = self.get_core_engine_for_request(
+            request.data_parallel_rank)
         self.reqs_in_flight[request.request_id] = chosen_engine
 
         to_await = self._send_input(EngineCoreRequestType.ADD, request,
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 5c0d01d9b..546fc98d6 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -212,6 +212,7 @@ class Processor:
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> tuple[Optional[str], EngineCoreRequest]:
 
         # TODO(woosuk): Support pooling models.
@@ -328,6 +329,7 @@ class Processor:
             arrival_time=arrival_time,
             lora_request=lora_request,
             cache_salt=decoder_inputs.get("cache_salt"),
+            data_parallel_rank=data_parallel_rank,
         )
 
     def _validate_model_inputs(self,
-- 
GitLab


From d459fae0a2c464e28680bc6d564c1de1b295029e Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 4 Jun 2025 11:39:23 -0400
Subject: [PATCH 172/274] [Bugfix][EP+DP] Fix internode check (#19112)

Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
---
 vllm/distributed/device_communicators/all2all.py            | 6 ------
 .../device_communicators/base_device_communicator.py        | 3 +--
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 2ab3779ec..cab2496bf 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -84,10 +84,6 @@ class PPLXAll2AllManager(All2AllManagerBase):
         assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."  # noqa
         super().__init__(cpu_group)
 
-        # TODO(tms): Disable pplx-a2a intranode as it fails with the error:
-        # failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa
-        self.internode = True
-
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly
@@ -178,7 +174,6 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
             num_rdma_bytes = 1024 * 1024 * 1024
             num_qps_per_rank = self.num_sms // 2
         else:
-            assert self.intranode
             num_rdma_bytes = 0
             num_qps_per_rank = 1
 
@@ -243,7 +238,6 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
         if self.internode:
             num_rdma_bytes = 1024 * 1024 * 1024
         else:
-            assert self.intranode
             num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
                 num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
                 hidden=token_hidden_size,
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 38370d4dc..1bc2d8e02 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -49,8 +49,7 @@ class All2AllManagerBase:
 
         # all2all communication often has separate implementations for
         # intra-node and inter-node communication
-        self.intranode = in_the_same_node_as(cpu_group, source_rank=0)
-        self.internode = not self.intranode
+        self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
 
     def get_handle(self, kwargs):
         # get a handle for the all2all communication,
-- 
GitLab


From 53a5a0ce30dd623808ebd02947e5183f918b6c2f Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Wed, 4 Jun 2025 13:46:28 -0400
Subject: [PATCH 173/274] [Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 .../c3x/scaled_mm_sm100_fp8_dispatch.cuh      | 53 ++++++++++++++++++-
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
index 468b77d95..6da2da634 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller;
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_default {
+  // M in (128, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
@@ -25,6 +26,34 @@ struct sm100_fp8_config_default {
                             KernelSchedule, EpilogueSchedule>;
 };
 
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _64>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M64 {
+  // M in [1, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
@@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default<InType, OutType,
                                         Epilogue>::Cutlass3xGemm;
-  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-      out, a, b, std::forward<EpilogueArgs>(args)...);
+  using Cutlass3xGemmM64 =
+      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 64) {
+    // m in [1, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
 }
 
 template <template <typename, typename, typename> typename Epilogue,
-- 
GitLab


From 7ee2590478e088e8df56652ba2607cecb5123b6b Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Wed, 4 Jun 2025 13:13:43 -0700
Subject: [PATCH 174/274] [TPU] Update dynamo dump file name in compilation
 test (#19108)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
---
 .../scripts/hardware_ci/run-tpu-v1-test.sh    |  2 +-
 tests/tpu/test_compilation.py                 |  5 ++--
 tests/v1/tpu/worker/test_tpu_model_runner.py  | 25 +++++++++++--------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index a394046d2..a2a5c2a02 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k 'not test_structured_output_with_reasoning_matrices'"
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 12 "test_moe_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py
index 3a180c679..448b8b2bc 100644
--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -64,9 +64,10 @@ def test_tpu_compilation():
         numbers = [int(part) for part in parts if part.isdigit()]
         return numbers[0]
 
-    # Check all the compilations are as expected
+    # Check all the compilations are as expected. The dump files include the
+    # captured graph for the forward function of the nn.Module.
     compiled_fns = sorted(glob.glob(
-        os.path.join(temp_dir, "__compiled_fn*Captured*.py")),
+        os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
                           key=lambda s: extract_compiled_index(s))
 
     for i, compiled_fn in enumerate(compiled_fns):
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index bc54b6ecc..e351f0e92 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -370,6 +370,7 @@ def test_get_req_paddings():
     assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
 
 
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
 def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -381,7 +382,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
             layer_0:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_0,
                 kv_sharing_target_layer_name=layer_1,
@@ -389,7 +390,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
             layer_1:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_1,
             )
@@ -398,6 +399,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
         assert fwd_context is not None
 
 
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
 def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -408,14 +410,14 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
             layer_0:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
             layer_1:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_1,
                 # invalid layer: cross_attn.atn doesn't exist!
@@ -426,6 +428,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
         assert fwd_context is not None
 
 
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
 def test_init_kv_cache_with_kv_sharing_target_same_as_current():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -437,14 +440,14 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
             layer_0:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
             layer_1:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_1,
                 kv_sharing_target_layer_name=layer_1,
@@ -454,6 +457,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
         assert fwd_context is not None
 
 
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
 def test_init_kv_cache_without_kv_sharing(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -463,14 +467,14 @@ def test_init_kv_cache_without_kv_sharing(model_runner):
             layer_0:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
             layer_1:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_1,
             )
@@ -520,6 +524,7 @@ def test_init_kv_cache_without_kv_sharing(model_runner):
     assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
 
 
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
 def test_init_kv_cache_with_kv_sharing_valid(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -529,14 +534,14 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner):
             layer_0:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
             layer_1:
             Attention(
                 num_heads=8,
-                head_size=64,
+                head_size=128,
                 scale=1.0,
                 prefix=layer_1,
                 kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
-- 
GitLab


From ef3f98b59fb971291a8cf3d0530ab7822f39d959 Mon Sep 17 00:00:00 2001
From: Kebe <mail@kebe7jun.com>
Date: Thu, 5 Jun 2025 04:17:38 +0800
Subject: [PATCH 175/274] [Bugfix] fix v1 cpu worker fails on macOS (#19121)

---
 vllm/engine/arg_utils.py |  5 +++--
 vllm/platforms/cpu.py    | 12 +++++++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 901346831..1ba50fec9 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1290,7 +1290,7 @@ class EngineArgs:
         # Skip this check if we are running on a non-GPU platform,
         # or if the device capability is not available
         # (e.g. in a Ray actor without GPUs).
-        from vllm.platforms import current_platform
+        from vllm.platforms import CpuArchEnum, current_platform
         if (current_platform.is_cuda()
                 and current_platform.get_device_capability()
                 and current_platform.get_device_capability().major < 8):
@@ -1434,7 +1434,8 @@ class EngineArgs:
         # Non-[CUDA, TPU] may be supported on V1, but off by default for now.
         v0_hardware = not any(
             (current_platform.is_cuda(), current_platform.is_tpu(),
-             current_platform.is_cpu()))
+             (current_platform.is_cpu()
+              and current_platform.get_cpu_architecture() == CpuArchEnum.X86)))
         if v0_hardware and _warn_or_fallback(  # noqa: SIM103
                 current_platform.device_name):
             return False
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 265959d62..71c964fbf 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
+import platform
 import sys
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, Optional
@@ -22,6 +23,15 @@ else:
     VllmConfig = None
 
 
+def get_max_threads(pid=0):
+    if hasattr(os, 'sched_getaffinity'):
+        return len(os.sched_getaffinity(pid))
+    elif platform.system() == 'Darwin':
+        return os.cpu_count()
+    else:
+        raise NotImplementedError("Unsupported OS")
+
+
 class CpuPlatform(Platform):
     _enum = PlatformEnum.CPU
     device_name: str = "cpu"
@@ -190,7 +200,7 @@ class CpuPlatform(Platform):
 
         # Note: to avoid the error 'nthreads cannot be larger than environment
         #  variable "NUMEXPR_MAX_THREADS" (64)'.
-        os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0)))
+        os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
 
         # Set default threads num for OpenMP parallel
         os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
-- 
GitLab


From c3fd4d669a4446578ea83bf7c330112339d2d8e1 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Wed, 4 Jun 2025 17:59:18 -0400
Subject: [PATCH 176/274] [Kernel] Integrate batched/masked deepgemm kernel
 (#19111)

Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
---
 tests/kernels/moe/deepep_utils.py             |   5 +-
 tests/kernels/moe/test_deepep_deepgemm_moe.py | 190 +++++++++++++++---
 .../layers/fused_moe/batched_deep_gemm_moe.py | 124 ++++++++++++
 .../batched_triton_or_deep_gemm_moe.py        | 116 +++++++++++
 .../fused_moe/deepep_ll_prepare_finalize.py   |  76 +++++--
 .../model_executor/layers/quantization/fp8.py |  12 +-
 6 files changed, 472 insertions(+), 51 deletions(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
 create mode 100644 vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py
index 2bc9b657d..117f1babd 100644
--- a/tests/kernels/moe/deepep_utils.py
+++ b/tests/kernels/moe/deepep_utils.py
@@ -162,12 +162,14 @@ def make_deepep_ll_a2a(pg: ProcessGroup,
                             low_latency_mode=True,
                             num_qps_per_rank=deepep_ll_args.num_experts //
                             pgi.world_size)
+
     return DeepEPLLPrepareAndFinalize(
         buffer=buffer,
         world_size=pgi.world_size,
         dp_size=dp_size,
         max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
         quant_dtype=q_dtype,
+        block_shape=block_shape,
         use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
     )
 
@@ -185,4 +187,5 @@ def make_deepep_a2a(pg: ProcessGroup,
                                   block_shape)
 
     assert deepep_ll_args is not None
-    return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype)
+    return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
+                              block_shape)
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index a1fdc1d5f..2d7cf39a8 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 """
 Test DeepEP + DeepGEMM integration 
+DeepGEMM are gemm kernels specialized for the
+fp8 block-quantized case.
 """
 
 import dataclasses
@@ -33,10 +35,14 @@ except ImportError:
 if has_deep_ep:
     from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
         DeepEPHTPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+        DeepEPLLPrepareAndFinalize)
 
-    from .deepep_utils import DeepEPHTArgs, make_deepep_a2a
+    from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
 
 if has_deep_gemm:
+    from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+        BatchedDeepGemmExperts)
     from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
         DeepGemmExperts)
 
@@ -53,6 +59,13 @@ requires_deep_gemm = pytest.mark.skipif(
 P = ParamSpec("P")
 
 
+def next_power_of_2(x):
+    import math
+    if x == 0:
+        return 1
+    return 2**math.ceil(math.log2(x))
+
+
 def per_block_cast_to_fp8(
         x: torch.Tensor,
         block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
@@ -126,6 +139,9 @@ class TestConfig:
     n: int
     num_experts: int
     block_size: list[int]
+    # configs for testing low-latency kernels
+    low_latency: bool
+    use_fp8_dispatch: Optional[bool] = False
 
 
 @dataclasses.dataclass
@@ -170,9 +186,43 @@ class TestTensors:
                            config=config)
 
 
-def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
-                        num_local_experts: int, q_dtype: Optional[torch.dtype],
-                        block_shape: list[int]) -> FusedMoEModularKernel:
+def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                           max_tokens_per_rank: int, dp_size: int,
+                           hidden_size: int, q_dtype: Optional[torch.dtype],
+                           test_config: TestConfig) -> FusedMoEModularKernel:
+
+    assert test_config.low_latency
+    assert test_config.use_fp8_dispatch is not None
+
+    a2a: DeepEPLLPrepareAndFinalize = make_deepep_a2a(
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        deepep_ht_args=None,
+        deepep_ll_args=DeepEPLLArgs(
+            max_tokens_per_rank=max_tokens_per_rank,
+            hidden_size=hidden_size,
+            num_experts=test_config.num_experts,
+            use_fp8_dispatch=test_config.use_fp8_dispatch),
+        q_dtype=q_dtype,
+        block_shape=test_config.block_size)
+
+    fused_experts = BatchedDeepGemmExperts(max_num_tokens=max_tokens_per_rank,
+                                           world_size=pgi.world_size,
+                                           dp_size=dp_size,
+                                           block_shape=test_config.block_size)
+    mk = FusedMoEModularKernel(prepare_finalize=a2a,
+                               fused_experts=fused_experts)
+    return mk
+
+
+def make_ht_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                           dp_size: int, num_local_experts: int,
+                           q_dtype: Optional[torch.dtype],
+                           test_config: TestConfig) -> FusedMoEModularKernel:
+
+    assert not test_config.low_latency
+    assert test_config.use_fp8_dispatch is None
 
     a2a: DeepEPHTPrepareAndFinalize = make_deepep_a2a(
         pg=pg,
@@ -181,7 +231,7 @@ def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
         deepep_ht_args=DeepEPHTArgs(num_local_experts=num_local_experts),
         deepep_ll_args=None,
         q_dtype=q_dtype,
-        block_shape=block_shape)
+        block_shape=test_config.block_size)
 
     fused_experts = DeepGemmExperts()
     mk = FusedMoEModularKernel(prepare_finalize=a2a,
@@ -189,12 +239,42 @@ def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
     return mk
 
 
-def deep_ep_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
-                     test_tensors: TestTensors, w1: torch.Tensor,
-                     w2: torch.Tensor, w1_scale: Optional[torch.Tensor],
-                     w2_scale: Optional[torch.Tensor],
-                     num_experts: int) -> torch.Tensor:
+def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
+                        num_local_experts: int,
+                        test_tensors: TestTensors) -> FusedMoEModularKernel:
+
+    q_dtype = torch.float8_e4m3fn
+    test_config = test_tensors.config
+
+    mk: FusedMoEModularKernel
+    # Make modular kernel
+    if test_config.low_latency:
+        max_tokens_per_rank = max(
+            64, next_power_of_2(test_tensors.rank_tokens.size(0)))
+        hidden_size = test_tensors.rank_tokens.size(-1)
+
+        mk = make_ll_modular_kernel(pg=pg,
+                                    pgi=pgi,
+                                    max_tokens_per_rank=max_tokens_per_rank,
+                                    dp_size=dp_size,
+                                    hidden_size=hidden_size,
+                                    q_dtype=q_dtype,
+                                    test_config=test_config)
+    else:
+        mk = make_ht_modular_kernel(pg, pgi, dp_size, num_local_experts,
+                                    q_dtype, test_config)
+
+    return mk
+
+
+def deepep_deepgemm_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                             dp_size: int, test_tensors: TestTensors,
+                             w1: torch.Tensor, w2: torch.Tensor,
+                             w1_scale: Optional[torch.Tensor],
+                             w2_scale: Optional[torch.Tensor]) -> torch.Tensor:
 
+    test_config = test_tensors.config
+    num_experts = test_config.num_experts
     num_local_experts = w1.size(0)
 
     def build_expert_map():
@@ -208,14 +288,17 @@ def deep_ep_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
         return expert_map.to(device=torch.cuda.current_device(),
                              dtype=torch.int32)
 
-    q_dtype = torch.float8_e4m3fn
-
     # Make modular kernel
     mk: FusedMoEModularKernel = make_modular_kernel(
-        pg, pgi, dp_size, num_local_experts, q_dtype,
-        test_tensors.config.block_size)
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        num_local_experts=num_local_experts,
+        test_tensors=test_tensors)
 
-    a1_scale = test_tensors.rank_token_scales
+    # Low-Latency kernels can't dispatch scales.
+    a1_scale = (None
+                if test_config.low_latency else test_tensors.rank_token_scales)
 
     out = mk.forward(hidden_states=test_tensors.rank_tokens,
                      w1=w1,
@@ -258,7 +341,7 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
         allow_deep_gemm=False)
 
 
-def _deep_ep_moe(
+def _test_deepep_deepgemm_moe(
     pgi: ProcessGroupInfo,
     dp_size: int,
     config: TestConfig,
@@ -302,7 +385,7 @@ def _deep_ep_moe(
         w1_scale_ep = w1_scale[e_start:e_end]
         w2_scale_ep = w2_scale[e_start:e_end]
 
-        deepep_moe = deep_ep_moe_impl(
+        deepep_moe = deepep_deepgemm_moe_impl(
             pg,
             pgi,
             dp_size,
@@ -311,7 +394,6 @@ def _deep_ep_moe(
             w2_ep,
             w1_scale_ep,
             w2_scale_ep,
-            config.num_experts,
         )
 
     torch.testing.assert_close(
@@ -335,15 +417,21 @@ MNKs = [
     (222, 1024, 2048),
 ]
 
+TOPKS = [2, 6]
+NUM_EXPERTS = [32]
+
 
 @pytest.mark.parametrize("mnk", MNKs)
-@pytest.mark.parametrize("num_experts", [32])
-@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOPKS)
 @pytest.mark.parametrize("world_dp_size", [(2, 1)])
 @requires_deep_ep
 @requires_deep_gemm
-def test_deep_ep_moe(mnk: tuple[int, int, int], num_experts: int, topk: int,
-                     world_dp_size: tuple[int, int]):
+def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
+                                topk: int, world_dp_size: tuple[int, int]):
+    """
+    Tests for High-Throughput DeepEP + DeepGemm integration.
+    """
 
     m, n, k = mnk
     current_platform.seed_everything(7)
@@ -354,6 +442,58 @@ def test_deep_ep_moe(mnk: tuple[int, int, int], num_experts: int, topk: int,
     block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
     block_size = [block_m, block_m]
 
+    world_size, dp_size = world_dp_size
+    config = TestConfig(topk=topk,
+                        m=m,
+                        k=k,
+                        n=n,
+                        num_experts=num_experts,
+                        block_size=block_size,
+                        low_latency=False,
+                        use_fp8_dispatch=None)
+
+    w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights(
+        num_experts, n, k, block_size)
+
+    parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1,
+                    w2, w1_scale, w2_scale)
+
+
+MNKs = [
+    (1, 128, 2560),
+    (2, 128, 2560),
+    (3, 1024, 2560),
+    (32, 128, 2560),
+    (45, 512, 2560),
+    (64, 1024, 2560),
+    (222, 1024, 2560),
+]
+# Fix tests for USE_FP8_DISPATCH=True
+USE_FP8_DISPATCH = [False]
+
+
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOPKS)
+@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
+@pytest.mark.parametrize("block_size", [[128, 128]])
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@requires_deep_ep
+@requires_deep_gemm
+def test_ll_deepep_deepgemm_moe(mnk: tuple[int, int,
+                                           int], num_experts: int, topk: int,
+                                use_fp8_dispatch: bool, block_size: list[int],
+                                world_dp_size: tuple[int, int]):
+    """
+    Tests for Low-Latency DeepEP + DeepGemm integration.
+    """
+
+    m, n, k = mnk
+    current_platform.seed_everything(7)
+
+    if topk > num_experts:
+        pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
+
     world_size, dp_size = world_dp_size
     config = TestConfig(
         topk=topk,
@@ -362,10 +502,12 @@ def test_deep_ep_moe(mnk: tuple[int, int, int], num_experts: int, topk: int,
         n=n,
         num_experts=num_experts,
         block_size=block_size,
+        low_latency=True,
+        use_fp8_dispatch=use_fp8_dispatch,
     )
 
     w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights(
         num_experts, n, k, block_size)
 
-    parallel_launch(world_size, _deep_ep_moe, dp_size, config, w1, w2,
-                    w1_scale, w2_scale)
+    parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1,
+                    w2, w1_scale, w2_scale)
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
new file mode 100644
index 000000000..a541d46b1
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+import importlib.util
+from typing import Optional
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache, per_token_group_quant_fp8)
+
+logger = init_logger(__name__)
+
+has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None
+
+
+class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+
+    # The Deep Gemm kernels only support block size of 128
+    DEEPGEMM_BLOCK_SHAPE = 128
+
+    def __init__(self, max_num_tokens: int, world_size: int, dp_size: int,
+                 block_shape: list[int]):
+        """
+        max_num_tokens: Maximum number of tokens from a DP Rank
+        world_size: Number of EP ranks
+        dp_size: Number of data-parallel ranks
+        block_shape: Block quantization block shape
+        """
+        super().__init__()
+        self.max_num_tokens = max_num_tokens
+        self.world_size = world_size
+        self.dp_size = dp_size
+        self.block_shape = block_shape
+
+        assert (len(self.block_shape) == 2 and all(
+            [v == self.DEEPGEMM_BLOCK_SHAPE for v in self.block_shape]))
+
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+    ) -> tuple[int, int, torch.dtype]:
+        assert a.dim() == 2
+        num_dp = self.world_size // self.dp_size
+        max_num_tokens = a.size(
+            0) if self.max_num_tokens is None else self.max_num_tokens
+        workspace13 = num_experts * max_num_tokens * num_dp * max(K, N)
+        workspace2 = num_experts * max_num_tokens * num_dp * (N // 2)
+        return (workspace13, workspace2, a.dtype)
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_num_tokens: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        import deep_gemm as dg
+        assert hidden_states.ndim == 3
+
+        a1q = hidden_states
+        _, N, K = w1.size()
+
+        if global_num_experts == -1:
+            global_num_experts = w1.size(0)
+
+        assert w2.size(1) == K
+
+        E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
+            hidden_states, w1, w2, topk_ids)
+
+        workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))
+        workspace2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2))
+        workspace3 = _resize_cache(workspace13, (E, max_num_tokens, K))
+
+        # (from deepgemm docs) : A value hint (which is a value on CPU)
+        # for the M expectation of each batch, correctly setting this value
+        # may lead to better performance.
+        expected_m = max_num_tokens
+
+        dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a1q, a1q_scale),
+                                                 (w1, w1_scale),
+                                                 out=workspace1,
+                                                 masked_m=expert_num_tokens,
+                                                 expected_m=expected_m)
+
+        # TODO (varun) [Optimization]: Use a batched version of activation.
+        # Similarly for the quant below.
+        self.activation(activation, workspace2, workspace1.view(-1, N))
+
+        w2_hidden_size = workspace2.size(-1)
+        workspace2 = workspace2.view(-1, w2_hidden_size)
+
+        a2q_scale: Optional[torch.Tensor] = None
+        a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
+                                                   self.block_shape[1],
+                                                   column_major_scales=False)
+        a2q = a2q.view(E, max_num_tokens, -1)
+        a2q_scale = a2q_scale.view(E, max_num_tokens, -1)
+
+        dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a2q, a2q_scale),
+                                                 (w2, w2_scale),
+                                                 out=workspace3,
+                                                 masked_m=expert_num_tokens,
+                                                 expected_m=expected_m)
+
+        return workspace3
diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
new file mode 100644
index 000000000..4db6b84e9
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+    BatchedDeepGemmExperts)
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+    BatchedTritonExperts)
+
+
+class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+
+    def __init__(self,
+                 max_num_tokens: int,
+                 world_size: int,
+                 dp_size: int,
+                 use_fp8_w8a8: bool = False,
+                 use_int8_w8a8: bool = False,
+                 use_int8_w8a16: bool = False,
+                 use_int4_w4a16: bool = False,
+                 per_channel_quant: bool = False,
+                 block_shape: Optional[list[int]] = None,
+                 allow_deep_gemm: bool = False):
+        super().__init__()
+        assert not use_int8_w8a8, "NYI"
+        assert not use_int8_w8a16, "NYI"
+        assert not use_int4_w4a16, "NYI"
+
+        self.max_num_tokens = max_num_tokens
+        self.world_size = world_size
+        self.dp_size = dp_size
+        self.use_fp8_w8a8 = use_fp8_w8a8
+        self.use_int8_w8a8 = use_int8_w8a8
+        self.use_int8_w8a16 = use_int8_w8a16
+        self.use_int4_w4a16 = use_int4_w4a16
+        self.per_channel_quant = per_channel_quant
+        self.block_shape = block_shape
+        self.allow_deep_gemm = allow_deep_gemm
+
+        # BatchedTritonKernel doesn't support block quantization
+        # at the moment.
+        self.batched_triton_experts = BatchedTritonExperts(
+            max_num_tokens=self.max_num_tokens,
+            use_fp8_w8a8=self.use_fp8_w8a8,
+            use_int8_w8a8=self.use_int8_w8a8,
+            use_int8_w8a16=self.use_int8_w8a16,
+            use_int4_w4a16=self.use_int4_w4a16,
+            per_channel_quant=self.per_channel_quant,
+            block_shape=self.block_shape,
+            world_size=self.world_size,
+            dp_size=self.dp_size) if self.block_shape is None else None
+
+        is_fp8_128_block_quantized = (self.use_fp8_w8a8
+                                      and self.block_shape is not None
+                                      and len(self.block_shape) == 2 and all(
+                                          [b == 128
+                                           for b in self.block_shape]))
+        self.batched_deep_gemm_experts = BatchedDeepGemmExperts(
+            max_num_tokens=self.max_num_tokens,
+            world_size=self.world_size,
+            dp_size=self.dp_size,
+            block_shape=self.block_shape,  # type: ignore[arg-type]
+        ) if (self.allow_deep_gemm and is_fp8_128_block_quantized) else None
+
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+    ) -> tuple[int, int, torch.dtype]:
+        # Note: the deep gemm workspaces are strictly larger than the triton
+        # workspaces so we can be pessimistic here and allocate for DeepGemm
+        # even if we fall back to triton later, e.g. if expert maps are set.
+        if self.allow_deep_gemm and self.batched_deep_gemm_experts is not None:
+            return self.batched_deep_gemm_experts.workspace_shapes(
+                a, M, N, K, topk, num_experts)
+        else:
+            assert self.batched_triton_experts is not None
+            return self.batched_triton_experts.workspace_shapes(
+                a, M, N, K, topk, num_experts)
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_num_tokens: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        use_batched_deep_gemm_experts = (self.allow_deep_gemm
+                                         and self.batched_deep_gemm_experts
+                                         is not None)
+        experts = (self.batched_deep_gemm_experts
+                   if use_batched_deep_gemm_experts else
+                   self.batched_triton_experts)
+        assert experts is not None
+        return experts.apply(hidden_states, w1, w2, topk_ids, activation,
+                             global_num_experts, expert_map, w1_scale,
+                             w2_scale, w1_zp, w2_zp, a1q_scale, a2_scale,
+                             workspace13, workspace2, expert_num_tokens)
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index b9d817a14..3484a7a8a 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional
+from typing import Optional, Union
 
 import deep_ep
 import torch
@@ -65,6 +65,54 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def topk_indices_dtype(self) -> Optional[torch.dtype]:
         return torch.int64
 
+    def _do_quant(
+            self, x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+            a1_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor],
+            a1_dtype: torch.dtype
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+
+        block_k = self.block_shape[1] if self.block_shape is not None else None
+        if self.use_fp8_dispatch:
+            if block_k == DEEPEP_QUANT_BLOCK_SIZE:
+                # DeepEP kernels did the quantization for us.
+                x, x_scales = x
+                return x, x_scales
+
+            # Dequant to get back the tokens in the datatype we dispatched in.
+            x_fp8, x_scales = x
+            x = dequant_fp8(x_fp8, x_scales).to(dtype=a1_dtype)
+
+        assert isinstance(x, torch.Tensor)
+
+        # Check if there is a block_shape / or if we can infer the quantization
+        # schemes from the scales.
+        per_token_quant = None
+        if all([v is None for v in [self.block_shape, a1_scale, a2_scale]
+                ]) and self.quant_dtype is not None:
+            # Quantization required despite none of the inputs suggesting
+            # quantization. Fallback to per_token_dynamic quant.
+            per_token_quant = True
+        else:
+            per_token_quant = ((self.block_shape is not None) or
+                               (a1_scale is not None and a1_scale.numel() != 1)
+                               or (a2_scale is not None
+                                   and a2_scale.numel() != 1))
+
+        num_experts, max_tokens, hidden_dim = x.size()
+
+        # TODO (varun): Optimization - Use a batched version of quant
+        x = x.view((-1, hidden_dim))
+        x, x_scales = moe_kernel_quantize_input(x, a1_scale, self.quant_dtype,
+                                                per_token_quant,
+                                                self.block_shape)
+        x = x.view((num_experts, -1, hidden_dim))
+
+        if per_token_quant:
+            assert x_scales is not None
+            x_scales = x_scales.view(num_experts, max_tokens, -1)
+
+        return x, x_scales
+
     def prepare(
         self,
         a1: torch.Tensor,
@@ -87,11 +135,11 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             assert hidden_size % 128 == 0, \
             "DeepEP kernels quantize the inputs in blocks of shape 128"
 
-        # Quantize
-        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
+        has_per_token_scales = a1_scale.numel(
+        ) != 1 if a1_scale is not None else (
             a2_scale.numel() != 1 if a2_scale is not None else False)
-        assert not per_act_token, (
-            "low_latency kernels don't support per-act-token quant")
+        assert not has_per_token_scales, (
+            "low_latency kernels doesn't support dispatching per-token scales")
 
         if apply_router_weight_on_input:
             topk = rank_topk_ids.size(1)
@@ -110,22 +158,8 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                                                 async_finish=False,
                                                 return_recv_hook=False)
 
-        if self.use_fp8_dispatch:
-            # TODO (varun) : In the case of dynamic quantization, we could
-            # probably skip the quant below and use the results directly.
-            # Although note that the deepep quant is per token 128 elements.
-            expert_x_fp8, expert_x_scales = expert_x
-            expert_x = dequant_fp8(expert_x_fp8,
-                                   expert_x_scales).to(dtype=a1.dtype)
-
-        num_experts = expert_x.size(0)
-        hidden_dim = expert_x.size(-1)
-
-        expert_x = expert_x.view((-1, expert_x.size(-1)))
-        expert_x, expert_x_scale = moe_kernel_quantize_input(
-            expert_x, a1_scale, self.quant_dtype, per_act_token,
-            self.block_shape)
-        expert_x = expert_x.view((num_experts, -1, hidden_dim))
+        expert_x, expert_x_scale = self._do_quant(expert_x, a1_scale, a2_scale,
+                                                  a1.dtype)
 
         return (expert_x, expert_x_scale, expert_num_tokens, None, None)
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 2438ec30b..5ac22b6a0 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -771,21 +771,21 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
     def select_gemm_impl(self, prepare_finalize):
 
-        from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-            BatchedTritonExperts)
+        from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
+            BatchedTritonOrDeepGemmExperts)
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
             TritonOrDeepGemmExperts)
 
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
             "Marlin and ROCm AITER are not supported with all2all yet.")
 
-        experts: Optional[Union[BatchedTritonExperts,
+        experts: Optional[Union[BatchedTritonOrDeepGemmExperts,
                                 TritonOrDeepGemmExperts]] = None
         max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
         use_batched_experts = max_num_tokens_per_rank is not None
 
         if use_batched_experts:
-            experts = BatchedTritonExperts(
+            experts = BatchedTritonOrDeepGemmExperts(
                 max_num_tokens=max_num_tokens_per_rank,
                 world_size=prepare_finalize.world_size,
                 dp_size=prepare_finalize.dp_size,
@@ -793,7 +793,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 use_int8_w8a8=False,
                 use_int8_w8a16=False,
                 use_int4_w4a16=False,
-                block_shape=None,
+                per_channel_quant=False,
+                block_shape=self.quant_config.weight_block_size,
+                allow_deep_gemm=self.allow_deep_gemm,
             )
         else:
             experts = TritonOrDeepGemmExperts(
-- 
GitLab


From 23027e2daff4e0ef81baf4403b3d9eb452491b38 Mon Sep 17 00:00:00 2001
From: CYJiang <86391540+googs1025@users.noreply.github.com>
Date: Thu, 5 Jun 2025 06:37:25 +0800
Subject: [PATCH 177/274] [Misc] refactor: simplify
 EngineCoreClient.make_async_mp_client in AsyncLLM (#18817)

Signed-off-by: googs1025 <googs1025@gmail.com>
---
 vllm/v1/engine/async_llm.py   | 14 +++-----------
 vllm/v1/engine/core_client.py | 25 +++++++++++++++++++------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 61ea3c4c3..089f15aee 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -28,8 +28,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Device, cdiv
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.core_client import (AsyncMPClient, DPAsyncMPClient,
-                                        RayDPClient)
+from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 from vllm.v1.engine.output_processor import (OutputProcessor,
                                              RequestOutputCollector)
@@ -121,15 +120,8 @@ class AsyncLLM(EngineClient):
                                                 log_stats=self.log_stats)
 
         # EngineCore (starts the engine in background process).
-        core_client_class: type[AsyncMPClient]
-        if vllm_config.parallel_config.data_parallel_size == 1:
-            core_client_class = AsyncMPClient
-        elif vllm_config.parallel_config.data_parallel_backend == "ray":
-            core_client_class = RayDPClient
-        else:
-            core_client_class = DPAsyncMPClient
-
-        self.engine_core = core_client_class(
+
+        self.engine_core = EngineCoreClient.make_async_mp_client(
             vllm_config=vllm_config,
             executor_class=executor_class,
             log_stats=self.log_stats,
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 0cd58d01d..d1b0b300d 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -68,18 +68,31 @@ class EngineCoreClient(ABC):
                 "is not currently supported.")
 
         if multiprocess_mode and asyncio_mode:
-            if vllm_config.parallel_config.data_parallel_size > 1:
-                if vllm_config.parallel_config.data_parallel_backend == "ray":
-                    return RayDPClient(vllm_config, executor_class, log_stats)
-                return DPAsyncMPClient(vllm_config, executor_class, log_stats)
-
-            return AsyncMPClient(vllm_config, executor_class, log_stats)
+            return EngineCoreClient.make_async_mp_client(
+                vllm_config, executor_class, log_stats)
 
         if multiprocess_mode and not asyncio_mode:
             return SyncMPClient(vllm_config, executor_class, log_stats)
 
         return InprocClient(vllm_config, executor_class, log_stats)
 
+    @staticmethod
+    def make_async_mp_client(
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
+    ) -> "MPClient":
+        if vllm_config.parallel_config.data_parallel_size > 1:
+            if vllm_config.parallel_config.data_parallel_backend == "ray":
+                return RayDPClient(vllm_config, executor_class, log_stats,
+                                   client_addresses, client_index)
+            return DPAsyncMPClient(vllm_config, executor_class, log_stats,
+                                   client_addresses, client_index)
+        return AsyncMPClient(vllm_config, executor_class, log_stats,
+                             client_addresses, client_index)
+
     @abstractmethod
     def shutdown(self):
         ...
-- 
GitLab


From b2fac67130b1cff07b175c4859c297c746e06601 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Thu, 5 Jun 2025 01:25:34 +0200
Subject: [PATCH 178/274] [P/D] Heterogeneous TP (#18833)

Signed-off-by: nicklucche <nlucches@redhat.com>
---
 .../nixl_integration/run_accuracy_test.sh     |  11 +-
 .../nixl_integration/test_accuracy.py         |   1 +
 .../kv_transfer/kv_connector/utils.py         |  18 +-
 .../kv_connector/v1/nixl_connector.py         | 338 +++++++++++++-----
 vllm/v1/attention/backends/flash_attn.py      |  16 +
 vllm/worker/worker_base.py                    |   3 +-
 6 files changed, 287 insertions(+), 100 deletions(-)

diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index c17784e0a..b48655d80 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -8,7 +8,9 @@ MODELS=(
 
 # Number of prefill and decode instances to create
 NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1
-NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-2}   # Default to 2
+NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}   # Default to 1
+PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
+DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
 
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
@@ -74,9 +76,10 @@ run_tests_for_model() {
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs
     GPU_ID=$((i % $(get_num_gpus)))
+
     # Calculate port number (base port + instance number)
     PORT=$((8100 + i))
-    # Calculate side channel port
+    # Calculate side channel port. Avoid clash with with TP workers. 
     SIDE_CHANNEL_PORT=$((5559 + i))
 
     echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
@@ -87,6 +90,7 @@ run_tests_for_model() {
     --enforce-eager \
     --disable-log-requests \
     --gpu-memory-utilization 0.2 \
+    --tensor-parallel-size $PREFILLER_TP_SIZE \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
     if [ -n "$model_args" ]; then
@@ -109,7 +113,7 @@ run_tests_for_model() {
     # Calculate port number (base port + instance number)
     PORT=$((8200 + i))
     # Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5659 + i))
+    SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE))
 
     echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
 
@@ -119,6 +123,7 @@ run_tests_for_model() {
     --enforce-eager \
     --disable-log-requests \
     --gpu-memory-utilization 0.2 \
+    --tensor-parallel-size $DECODER_TP_SIZE \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
     if [ -n "$model_args" ]; then
diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
index 2b2b147ce..e5d66ffee 100644
--- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
@@ -14,6 +14,7 @@ RTOL = 0.03
 # Model-specific expected values
 EXPECTED_VALUES = {
     "Qwen/Qwen3-0.6B": 0.41,
+    "deepseek-ai/deepseek-vl2-small": 0.59
 }
 
 SIMPLE_PROMPT = "The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means",  # noqa: E501
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index c62444e75..b9bed06d7 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -3,11 +3,12 @@
 """
 KV cache helper for store.
 """
+
 import torch
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -90,3 +91,18 @@ class model_aware_kv_ops_helper:
                 layer.self_attn.attn._k_scale,
                 layer.self_attn.attn._v_scale,
             )
+
+
+def get_kv_connector_cache_layout():
+    vllm_config = get_current_vllm_config()
+    kv_config = vllm_config.kv_transfer_config
+    if vllm_config.model_config is None:
+        logger.warning("Unable to detect current VLLM config. " \
+        "Defaulting to NHD kv cache layout.")
+    else:
+        use_mla = vllm_config.model_config.use_mla
+        if not use_mla and kv_config.kv_connector == "NixlConnector":
+            logger.info("NixlConnector detected. Setting KV cache " \
+            "layout to HND for better xfer performance.")
+            return "HND"
+    return "NHD"
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index fd2228012..400fa29c9 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request
 
+Transfer = tuple[int, float]  # (xfer_handle, start_time)
 GET_META_MSG = b"get_meta_msg"
 
 logger = init_logger(__name__)
@@ -54,6 +55,8 @@ class NixlAgentMetadata(
     agent_metadata: bytes
     kv_caches_base_addr: list[int]
     num_blocks: int
+    tp_size: int
+    block_len: int
 
 
 @dataclass
@@ -331,10 +334,14 @@ class NixlConnectorWorker:
         logger.info("Initializing NIXL wrapper")
         logger.info("Initializing NIXL worker %s", engine_id)
 
+        # Config.
+        self.vllm_config = vllm_config
+        self.block_size = vllm_config.cache_config.block_size
+
         # Agent.
         self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None)
-        # Map of engine_id -> agent_name.
-        self._remote_agents: dict[str, str] = {}
+        # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
+        self._remote_agents: dict[str, dict[int, str]] = defaultdict(dict)
 
         # NIXL handshake port.
         # NOTE(rob): Within a DP group, each DP rank gets its own
@@ -354,7 +361,8 @@ class NixlConnectorWorker:
         # KV Caches and nixl tracking data.
         self.kv_caches: dict[str, torch.Tensor] = {}
 
-        # Map of engine_id -> kv_caches_base_addr
+        # Map of engine_id -> kv_caches_base_addr. For TP case, each local
+        # rank will still only pull from a single remote TP worker.
         self.kv_caches_base_addr: dict[str, list[int]] = {}
 
         # Number of NIXL regions. Currently one region per cache
@@ -362,19 +370,19 @@ class NixlConnectorWorker:
         self.num_regions = 0
         self.num_layers = 0
 
-        # nixl_prepped_dlist_handle (int).
+        # nixl_prepped_dlist_handle.
         self.src_xfer_side_handle: int = 0
         # Map of engine_id -> nixl_prepped_dlist_handle (int)].
         self.dst_xfer_side_handles: dict[str, int] = {}
 
-        # Map of engine_id -> num_blocks.
+        # Map of engine_id -> num_blocks. All ranks in the same deployment will
+        # have the same number of blocks.
         self.dst_num_blocks: dict[str, int] = {}
         self._registered_descs: list[Any] = []
 
         # In progress transfers.
         # [req_id -> list[handle]]
-        self._recving_transfers: defaultdict[str, list[Any]] = defaultdict(
-            list[Any])
+        self._recving_transfers = defaultdict[str, list[Transfer]](list)
 
         # Complete transfer tracker. Used by the rank 0 to track finished
         # transactions on ranks 1 to N-1.
@@ -395,6 +403,11 @@ class NixlConnectorWorker:
         # List of block window sizes for each layer for local attention
         self.block_window_per_layer: list[Optional[int]] = []
 
+        self._tp_size: dict[str, int] = {self.engine_id: self.world_size}
+        # With heterogeneous TP, P must wait for all assigned D TP workers to
+        # finish reading before safely freeing the blocks.
+        self.consumer_notification_counts_by_req = defaultdict[str, int](int)
+
     @staticmethod
     def _nixl_handshake_listener(metadata: NixlAgentMetadata,
                                  ready_event: threading.Event, base_port: int,
@@ -426,27 +439,44 @@ class NixlConnectorWorker:
         """Do a NIXL handshake with a remote instance."""
 
         start_time = time.perf_counter()
-        # NOTE(rob): we need each tp_rank to have a unique port.
-        # This is a hack to keep us moving. We will switch when
-        # we switch to HTTP-based NIXL metadata exchange.
-        path = make_zmq_path("tcp", host, port + self.tp_rank)
-        logger.debug("Querying metadata on path: %s", path)
-        with zmq_ctx(zmq.REQ, path) as sock:
-            # Send query for the request.
-            sock.send(GET_META_MSG)
-            metadata_bytes = sock.recv()
-            decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
-            metadata = decoder.decode(metadata_bytes)
-            got_metadata_time = time.perf_counter()
 
-            # Register Remote agent.
-            self.add_remote_agent(metadata)
-            setup_agent_time = time.perf_counter()
+        # NOTE(rob): we need each rank to have a unique port. This is
+        # a hack to keep us moving. We will switch when moving to etcd
+        # or where we have a single ZMQ socket in the scheduler.
 
-            logger.debug("NIXL handshake: get metadata took: %s",
-                         got_metadata_time - start_time)
-            logger.debug("NIXL handshake: add agent took: %s",
-                         setup_agent_time - got_metadata_time)
+        def handshake(path: str, rank: int) -> NixlAgentMetadata:
+            # Send query for the request.
+            with zmq_ctx(zmq.REQ, path) as sock:
+                sock.send(GET_META_MSG)
+                metadata_bytes = sock.recv()
+                decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+                metadata = decoder.decode(metadata_bytes)
+                got_metadata_time = time.perf_counter()
+
+                # Register Remote agent.
+                self.add_remote_agent(metadata, rank)
+                setup_agent_time = time.perf_counter()
+
+                logger.debug("NIXL handshake: get metadata took: %s",
+                             got_metadata_time - start_time)
+                logger.debug("NIXL handshake: add agent took: %s",
+                             setup_agent_time - got_metadata_time)
+                return metadata
+
+        # Handshake with remote agent-rank0 first to get the tp_size of remote
+        path = make_zmq_path("tcp", host, port)
+        logger.debug("Querying master rank metadata on path: %s", path)
+        metadata = handshake(path, 0)
+
+        # Handshake only with the other TP remote the current local rank will
+        # pull from. With homogeneous TP it happens to be the same rank_i.
+        tp_ratio = self._tp_size[self.engine_id] // metadata.tp_size
+        p_remote_rank = self.tp_rank // tp_ratio
+        if p_remote_rank > 0:
+            path = make_zmq_path("tcp", host, port + p_remote_rank)
+            logger.debug("Querying metadata on path: %s at remote rank %s",
+                         path, p_remote_rank)
+            _ = handshake(path, p_remote_rank)
 
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """Register the KV Cache data in nixl."""
@@ -455,24 +485,34 @@ class NixlConnectorWorker:
         kv_elem_size = first_kv_cache.element_size()
 
         # TODO(tms): Find a more robust way to detect and handle MLA
-        use_mla = len(first_kv_cache.shape) == 3
-        if use_mla:
+        self.use_mla = len(first_kv_cache.shape) == 3
+        # NOTE (NickLucche) To move blocks efficiently with NIXL, the expected
+        # KV memory layout is HND, as opposed to the default NHD. Note that it
+        # will only affects the strides. For MLA instead, we make require no
+        # such thing and resort to the standard layout.
+        if self.use_mla:
             # MLA case.
             self.num_blocks = first_kv_cache.shape[0]
             block_rank = 2  # [block_size, latent_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
+            block_size, kv_latent_dim = block_shape
+            self.slot_size_bytes = kv_elem_size * kv_latent_dim
         else:
-            # [2 (k and v), num_blocks, ...]
+            # [2 (k and v), num_blocks, block_size, kv_heads, head_dim]
             self.num_blocks = first_kv_cache.shape[1]
             block_rank = 3  # [block_size, kv_heads, head_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
-
+            block_size, n_kv_heads, head_dim = block_shape
+            # head size in bytes.
+            self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
+        assert block_size == self.block_size
         # TODO(tms): self.block_len needs to be per-layer for sliding window,
         # hybrid attn, etc
+        # block size in bytes
         self.block_len = kv_elem_size * math.prod(block_shape)
 
-        logger.debug("Registering KV_Caches. use_mla: %s, shape %s", use_mla,
-                     first_kv_cache.shape)
+        logger.debug("Registering KV_Caches. use_mla: %s, shape %s",
+                     self.use_mla, first_kv_cache.shape)
         logger.debug("num_blocks: %s, block_shape: %s", self.num_blocks,
                      block_shape)
         logger.debug("Per layer kv cache size: %s", first_kv_cache.shape)
@@ -489,7 +529,7 @@ class NixlConnectorWorker:
         # (roughly 8KB vs 5KB).
         for cache_or_caches in kv_caches.values():
             # Normalize to always be a list of caches
-            cache_list = [cache_or_caches] if use_mla else cache_or_caches
+            cache_list = [cache_or_caches] if self.use_mla else cache_or_caches
             for cache in cache_list:
                 base_addr = cache.data_ptr()
                 region_len = self.num_blocks * self.block_len
@@ -524,16 +564,37 @@ class NixlConnectorWorker:
         logger.debug("Registering descs: %s", caches_data)
         self.nixl_wrapper.register_memory(descs)
         logger.debug("Done registering descs")
-
         self._registered_descs.append(descs)
 
+        # Register local/src descr for NIXL xfer.
+        blocks_data = []
+        for base_addr in self.kv_caches_base_addr[self.engine_id]:
+            # NOTE With heter-TP, more blocks are prepared than what are
+            # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We
+            # could create fewer, but then _get_block_descs_ids needs to
+            # select agent_meta.num_blocks instead of self.num_blocks for
+            # local descr, and that makes handling regular flow less clean.
+            for block_id in range(self.num_blocks):
+                block_offset = block_id * self.block_len
+                addr = base_addr + block_offset
+                # (addr, len, device id)
+                blocks_data.append((addr, self.block_len, self.tp_rank))
+        logger.debug("Created %s blocks for src engine %s and rank %s",
+                     len(blocks_data), self.engine_id, self.tp_rank)
+
+        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
+        # NIXL_INIT_AGENT to be used for preparations of local descs.
+        self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
+            "NIXL_INIT_AGENT", descs)
+
         # After KV Caches registered, listen for new connections.
         metadata = NixlAgentMetadata(
             engine_id=self.engine_id,
             agent_metadata=self.nixl_wrapper.get_agent_metadata(),
             kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
             num_blocks=self.num_blocks,
-        )
+            tp_size=self.world_size,
+            block_len=self.block_len)
         ready_event = threading.Event()
         self._nixl_handshake_listener_t = threading.Thread(
             target=self._nixl_handshake_listener,
@@ -543,50 +604,123 @@ class NixlConnectorWorker:
         self._nixl_handshake_listener_t.start()
         ready_event.wait()
 
-    def add_remote_agent(self, nixl_agent_meta: NixlAgentMetadata):
+    def add_remote_agent(self,
+                         nixl_agent_meta: NixlAgentMetadata,
+                         remote_tp_rank: int = 0):
+        """
+        Add the remote NIXL agent and prepare the descriptors for reading cache
+        blocks from remote.
+
+        In particular, handle both homogeneous and heterogeneous TP. The former
+        requires local rank_i to read from remote rank_i. 
+        The latter, assuming D.world_size > P.world_size, requires that two or 
+        more local TP worker share the xfer from a single TP worker.
+
+        Here's an example:
+
+        rank_offset     p_remote_tp_rank
+        (kv split no)    
+        --------------------------------
+            0                 0      Worker0  ---- 1st half of KV ----> Worker0  [ KV Cache ]
+                                                                        /
+            1                 0      Worker1  ---- 2nd half of KV -----/
+
+            0                 1      Worker2  ---- 1st half of KV ----> Worker1  [ KV Cache ]
+                                                                        /
+            1                 1      Worker3  ---- 2nd half of KV -----/
+
+
+                                Decoder TP workers                     Prefix TP workers
+                                  (world_size=4)                         (world_size=2)
+                                                 tp_ratio = 4 // 2 = 2                  
+                                
+        Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]  
+        then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
+        Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio 
+        first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
+        along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.   
+        
+        Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.
+
+        Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0
+        so that the whole cache is shared by "tp_ratio" D TP workers.
+        """ # noqa: E501
         engine_id = nixl_agent_meta.engine_id
-        assert engine_id != self.engine_id, "Conflict engine id found!"
-        if engine_id in self._remote_agents:
+        # TODO re-evaluate refreshing for scaling/recovery
+        if remote_tp_rank in self._remote_agents.get(engine_id, ()):
             return
 
-        self._remote_agents[engine_id] = self.nixl_wrapper.add_remote_agent(
-            nixl_agent_meta.agent_metadata)
-        self.kv_caches_base_addr[
-            engine_id] = nixl_agent_meta.kv_caches_base_addr
+        if engine_id in self._tp_size:
+            assert self._tp_size[engine_id] == nixl_agent_meta.tp_size
+        else:
+            self._tp_size[engine_id] = nixl_agent_meta.tp_size
+        self._remote_agents[engine_id][
+            remote_tp_rank] = self.nixl_wrapper.add_remote_agent(
+                nixl_agent_meta.agent_metadata)
+
+        # Number of D TP workers reading from a single P TP worker. This is
+        # 1 when P and D `--tensor-parallel-size` match.
+        assert self._tp_size[self.engine_id] % self._tp_size[engine_id] == 0, \
+        "Local TP size must be divisible by remote TP size."
+        tp_ratio = self._tp_size[self.engine_id] // self._tp_size[engine_id]
+        assert tp_ratio > 0, "Decode TP cannot be smaller than"
+        " prefill TP"
+        if self.use_mla:
+            # With MLA the only difference is in the number of blocks.
+            remote_block_size = nixl_agent_meta.block_len / (
+                self.slot_size_bytes)
+            assert self.block_len == nixl_agent_meta.block_len
+        else:
+            remote_block_size = nixl_agent_meta.block_len / (
+                self.slot_size_bytes * tp_ratio)
+
+            assert nixl_agent_meta.block_len == self.block_len * tp_ratio, \
+            "Remote P worker KV layer cache must be of shape [2, N, \
+            local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
 
-        # Create src descs and xfer side handles.
-        blocks_data = []
-        for base_addr in self.kv_caches_base_addr[self.engine_id]:
-            for block_id in range(self.num_blocks):
-                block_offset = block_id * self.block_len
-                # (addr, len, device id)
-                blocks_data.append(
-                    (base_addr + block_offset, self.block_len, self.tp_rank))
-        logger.debug("Created %s blocks for src engine %s and tp_rank %s",
-                     len(blocks_data), self.engine_id, self.tp_rank)
+        assert self.block_size == remote_block_size, "Remote P worker with \
+        different block size is not supported"
 
-        # Register with NIXL.
-        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
-        self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
-            "NIXL_INIT_AGENT", descs)
+        assert self.num_blocks >= nixl_agent_meta.num_blocks
+
+        # Create dst descs and xfer side handles. TP workers have same #blocks.
+        if engine_id in self.dst_num_blocks:
+            assert self.dst_num_blocks[engine_id] == nixl_agent_meta.num_blocks
+        else:
+            self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
 
-        # Create dst descs and xfer side handles.
-        self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
         blocks_data = []
-        for base_addr in self.kv_caches_base_addr[engine_id]:
-            for block_id in range(nixl_agent_meta.num_blocks):
-                block_offset = block_id * self.block_len
-                # (addr, len, device id)
-                blocks_data.append(
-                    (base_addr + block_offset, self.block_len, self.tp_rank))
-        logger.debug("Created %s blocks for dst engine %s and tp_rank %s",
-                     len(blocks_data), engine_id, self.tp_rank)
+        # With homogeneous TP, D pulls the whole kv cache from corresponding
+        # rank. With heterogeneous TP, prepare the descriptors by splitting the
+        # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
+        # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
+        p_remote_tp_rank = self.tp_rank // tp_ratio
+        # Only register the remote's descriptors if current rank pulls from it.
+        if p_remote_tp_rank == remote_tp_rank:
+            self.kv_caches_base_addr[
+                engine_id] = nixl_agent_meta.kv_caches_base_addr
+            rank_offset = self.tp_rank % tp_ratio * self.block_len \
+                if not self.use_mla else 0
+            # Register all remote blocks, but only the corresponding kv heads.
+            for base_addr in nixl_agent_meta.kv_caches_base_addr:
+                for block_id in range(nixl_agent_meta.num_blocks):
+                    block_offset = block_id * nixl_agent_meta.block_len
+                    # For each block, grab the heads chunk belonging to rank_i
+                    # of size remote_nheads // tp_ratio, which correspond to
+                    # self.block_len == remote_block_len//tp_ratio bytes.
+                    addr = base_addr + block_offset + rank_offset
+                    # (addr, len, device id)
+                    blocks_data.append((addr, self.block_len, remote_tp_rank))
+            logger.debug(
+                "Created %s blocks for dst engine %s with remote rank %s and " \
+                "local rank %s",
+                len(blocks_data), engine_id, remote_tp_rank, self.tp_rank)
 
-        # Register with NIXL.
-        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
-        self.dst_xfer_side_handles[
-            engine_id] = self.nixl_wrapper.prep_xfer_dlist(
-                self._remote_agents[engine_id], descs)
+            # Register with NIXL.
+            descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
+            self.dst_xfer_side_handles[
+                engine_id] = self.nixl_wrapper.prep_xfer_dlist(
+                    self._remote_agents[engine_id][remote_tp_rank], descs)
 
     def get_finished(self) -> tuple[set[str], set[str]]:
         """
@@ -654,16 +788,25 @@ class NixlConnectorWorker:
             return done_sending, done_recving
 
     def _get_new_notifs(self) -> set[str]:
-        """Get req_ids which got a remote xfer message."""
-
+        """
+        Get req_ids which got a remote xfer message. When multiple consumers
+        are reading from the same producer (heterogeneous TP scenario), wait
+        for all consumers to be done pulling.
+        """
         notified_req_ids: set[str] = set()
-        for req_ids in self.nixl_wrapper.get_new_notifs().values():
-            for req_id in req_ids:
-                assert req_id not in notified_req_ids
-                notified_req_ids.add(req_id.decode("utf-8"))
+        for notifs in self.nixl_wrapper.get_new_notifs().values():
+            for notif in notifs:
+                req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
+                self.consumer_notification_counts_by_req[req_id] += 1
+                # Wait all consumers (D) to be done reading before freeing.
+                if self.consumer_notification_counts_by_req[req_id] == int(
+                        tp_ratio):
+                    notified_req_ids.add(req_id)
+                    del self.consumer_notification_counts_by_req[req_id]
         return notified_req_ids
 
-    def _pop_done_transfers(self, transfers: dict[str, list[int]]) -> set[str]:
+    def _pop_done_transfers(
+            self, transfers: dict[str, list[tuple[int, float]]]) -> set[str]:
         """
         Pop completed xfers by checking for DONE state.
         Args:
@@ -673,23 +816,17 @@ class NixlConnectorWorker:
         """
         done_req_ids: set[str] = set()
         for req_id, handles in list(transfers.items()):
-            running_reqs = []
-            for handle in handles:
+            for handle, xfer_stime in handles:
                 xfer_state = self.nixl_wrapper.check_xfer_state(handle)
                 if xfer_state == "DONE":
-                    # TODO ptarasiewicz: why abort is throwing errors?
-                    # self.nixl_wrapper.release_xfer_handle(handle)
+                    self.nixl_wrapper.release_xfer_handle(handle)
+                    done_req_ids.add(req_id)
+                    del transfers[req_id]
+                elif xfer_state == "PROC":
                     continue
-                if xfer_state == "PROC":
-                    running_reqs.append(handle)
                 else:
                     raise RuntimeError("Transfer failed with state %s",
                                        xfer_state)
-            if len(running_reqs) == 0:
-                done_req_ids.add(req_id)
-                del transfers[req_id]
-            else:
-                transfers[req_id] = running_reqs
         return done_req_ids
 
     def start_load_kv(self, metadata: NixlConnectorMetadata):
@@ -735,13 +872,19 @@ class NixlConnectorWorker:
         # saturate IB with heterogeneous TP sizes. We should remove the staging
         # blocks until we are ready.
 
+        # Number of D TP workers that will read from dst P. Propagate tp_ratio
+        # on notification so that dst worker can wait before freeing blocks.
+        tp_ratio = self._tp_size[
+            self.engine_id] // self._tp_size[dst_engine_id]
+        notif_id = f"{request_id}:{tp_ratio}".encode()
+
         # Full prefix cache hit: do not need to read remote blocks,
         # just notify P worker that we have the blocks we need.
         num_local_blocks = len(local_block_ids)
         if num_local_blocks == 0:
-            agent_name = self._remote_agents[dst_engine_id]
-            self.nixl_wrapper.send_notif(agent_name,
-                                         notif_msg=request_id.encode("utf-8"))
+            remote_rank = self.tp_rank // tp_ratio
+            agent_name = self._remote_agents[dst_engine_id][remote_rank]
+            self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
             return
 
         # Partial prefix cache hit: just read uncomputed blocks.
@@ -754,6 +897,10 @@ class NixlConnectorWorker:
         local_xfer_side_handle = self.src_xfer_side_handle
         remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]
 
+        # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
+        # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
+        # workers will issue xfers to parts of the P worker remote kv caches.
+
         # Get descs ids.
         local_block_descs_ids: list[int] = []
         remote_block_descs_ids: list[int] = []
@@ -797,14 +944,16 @@ class NixlConnectorWorker:
             local_block_descs_ids,
             remote_xfer_side_handle,
             remote_block_descs_ids,
-            notif_msg=request_id.encode("utf-8"),
+            notif_msg=notif_id,
         )
 
         # Begin async xfer.
         self.nixl_wrapper.transfer(handle)
 
         # Use handle to check completion in future step().
-        self._recving_transfers[request_id].append(handle)
+        # TODO (NickLucche) surface xfer elapsed time
+        self._recving_transfers[request_id].append(
+            (handle, time.perf_counter()))
 
     def _get_block_descs_ids(self,
                              engine_id: str,
@@ -815,7 +964,6 @@ class NixlConnectorWorker:
         If layer_idx is provided, we use the region_ids for the given layer.
         Otherwise, we use all regions.
         """
-
         if layer_idx is None:
             region_ids = range(self.num_regions)
         else:
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index a9f748d02..91a7c43cd 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -16,6 +16,8 @@ from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
                                            get_flash_attn_version)
 from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.distributed.kv_transfer.kv_connector.utils import (
+    get_kv_connector_cache_layout)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import cdiv
@@ -70,6 +72,20 @@ class FlashAttentionBackend(AttentionBackend):
             raise ValueError("Block size must be a multiple of 16.")
         return (2, num_blocks, block_size, num_kv_heads, head_size)
 
+    @staticmethod
+    def get_kv_cache_stride_order() -> tuple[int, ...]:
+        # NOTE When running disaggregated PD with NIXL, HND layout is used for
+        # faster transfer. `stride_order` indicates the permutation that gets
+        # us from `get_kv_cache_shape` to the actual memory layout we want.
+        cache_layout = get_kv_connector_cache_layout()
+        if cache_layout == "NHD":
+            stride_order = (0, 1, 2, 3, 4)
+        elif cache_layout == "HND":
+            stride_order = (0, 1, 3, 2, 4)
+        else:
+            raise ValueError("Unknown cache layout format %s.", cache_layout)
+        return stride_order
+
 
 @dataclass
 class FlashAttentionMetadata:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index db1ca2d8f..0b37caa71 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -597,7 +597,8 @@ class WorkerWrapperBase:
 
     def initialize_from_config(self, kv_cache_configs: List[Any]) -> None:
         kv_cache_config = kv_cache_configs[self.rpc_rank]
-        self.worker.initialize_from_config(kv_cache_config)  # type: ignore
+        with set_current_vllm_config(self.vllm_config):
+            self.worker.initialize_from_config(kv_cache_config)  # type: ignore
 
     def init_device(self):
         with set_current_vllm_config(self.vllm_config):
-- 
GitLab


From 78dcf56cb31f8abb7d9ae3ba02ce9822eba34820 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Thu, 5 Jun 2025 09:13:50 +0800
Subject: [PATCH 179/274] [doc] small fix (#19167)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 docs/models/extensions/tensorizer.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md
index b6feb405c..e0b4479c0 100644
--- a/docs/models/extensions/tensorizer.md
+++ b/docs/models/extensions/tensorizer.md
@@ -10,7 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
 
 For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html).
+the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html).
 
 !!! note
     Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
-- 
GitLab


From c56ed8bb0e0e335033797044e33acc2b00e06b79 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Wed, 4 Jun 2025 22:07:32 -0400
Subject: [PATCH 180/274] [Bugfix][Nixl] Fix full prefix cache hit bug (#18632)

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
---
 .../kv_connector/unit/test_multi_connector.py | 76 ++++++++++++-------
 .../kv_connector/v1/multi_connector.py        | 41 ++++++----
 .../kv_connector/v1/nixl_connector.py         | 53 +++++--------
 vllm/v1/core/sched/scheduler.py               |  8 +-
 4 files changed, 97 insertions(+), 81 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index ddf2836d0..d90ddcdbb 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -12,6 +12,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
     KVConnectorFactory)
 from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
     SharedStorageConnector)
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 
@@ -32,7 +33,7 @@ class TestSharedStorageConnector(SharedStorageConnector):
         self.call_record: dict[str, int] = defaultdict(int)
         # Use a unique temp file per connector
         self._event_file = tempfile.gettempdir(
-        ) + f"/connector_{self.name}_events.log"
+        ) + f"/connector_{self.name}-{self.role.name}_events.log"
         # Start with an empty file
         with open(self._event_file, "w") as _:
             pass
@@ -52,10 +53,19 @@ class TestSharedStorageConnector(SharedStorageConnector):
 
             def wrapper(*args, **kwargs):
                 self.call_record[name] += 1
+
+                # Include args that we're interested in
+                to_log = [name]
+                for arg in args:
+                    if isinstance(arg, int):
+                        to_log.append(str(arg))
+                    elif isinstance(arg, KVCacheBlocks):
+                        to_log.append(f"num_blocks={len(arg.blocks)}")
+
                 # Log the event as a line to the file
                 try:
                     with open(self._event_file, "a") as f:
-                        f.write(name + "\n")
+                        f.write(' '.join(to_log) + "\n")
                 except Exception as e:
                     print(f"[ERROR] Could not log event {name} "
                           f"for {self.name}: {e}")
@@ -162,15 +172,23 @@ def test_multi_shared_storage_connector_consistency():
              f"{storage_1_path} and {storage_2_path}")
 
     events = get_connector_events()
-    # get_num_new_matched_tokens will be called on each connector in turn.
-    # neither of them have hits so update_state_after_alloc won't be called.
-    assert events["storage1"][:3] == [
-        'get_num_new_matched_tokens', 'build_connector_meta',
-        'bind_connector_metadata'
+    # get_num_new_matched_tokens and update_state_after_alloc will be called
+    # on each connector in turn.
+    assert events["storage1-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
+    ]
+    assert events["storage1-WORKER"][:5] == [
+        'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
+        'wait_for_layer_load', 'save_kv_layer'
+    ]
+    assert events["storage2-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
     ]
-    assert events["storage2"][:3] == [
-        'get_num_new_matched_tokens', 'build_connector_meta',
-        'bind_connector_metadata'
+    assert events["storage2-WORKER"][:5] == [
+        'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
+        'wait_for_layer_load', 'save_kv_layer'
     ]
 
     # Reset prefix cache or else we'll just get the tokens back from there.
@@ -182,16 +200,16 @@ def test_multi_shared_storage_connector_consistency():
 
     events = get_connector_events()
     # get_num_new_matched_tokens will return new tokens from the first
-    # connector so update_state_after_alloc will be called once blocks
-    # are allocated for the first connector.
-    # get_num_new_matched_tokens *won't* be called on the second connector
-    # in this case.
-    assert events["storage1"][:4] == [
-        'get_num_new_matched_tokens', 'update_state_after_alloc',
-        'build_connector_meta', 'bind_connector_metadata'
+    # connector so update_state_after_alloc will be with allocated blocks
+    # on that one but with zero blocks for others (first nonzero match is
+    # chosen).
+    assert events["storage1-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=7 96', 'build_connector_meta'
     ]
-    assert events["storage2"][:2] == [
-        'build_connector_meta', 'bind_connector_metadata'
+    assert events["storage2-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
     ]
 
     # Delete storage1 connector state
@@ -205,17 +223,17 @@ def test_multi_shared_storage_connector_consistency():
     _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
 
     events = get_connector_events()
-    # get_num_new_matched_tokens will be called for the first connector but it
-    # won't have a hit so update_state_after_alloc won't be called.
-    # get_num_new_matched_tokens will also be called on the second connector,
-    # but it should have a hit so update_state_after_alloc will be called.
-    assert events["storage1"][:3] == [
-        'get_num_new_matched_tokens', 'build_connector_meta',
-        'bind_connector_metadata'
+    # get_num_new_matched_tokens will be called for both connectors but will
+    # return 0 from the first connector, but the second connector should have
+    # a hit, so update_state_after_alloc will only be called with allocated
+    # blocks for the second connector.
+    assert events["storage1-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
     ]
-    assert events["storage2"][:4] == [
-        'get_num_new_matched_tokens', 'update_state_after_alloc',
-        'build_connector_meta', 'bind_connector_metadata'
+    assert events["storage2-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=7 96', 'build_connector_meta'
     ]
 
     # Clean up
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 5aab10b2b..b2cf88f5b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -12,12 +12,12 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
-    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -51,8 +51,9 @@ class MultiConnector(KVConnectorBase_V1):
             self._connectors.append(
                 KVConnectorFactory.create_connector_v1(temp_config, role))
 
-        # A mapping from request id to the connector that is assigned to it.
-        self._requests_to_connector: dict[str, KVConnectorBase_V1] = {}
+        # A mapping from request id to the index of the connector chosen to
+        # load the request from (if any).
+        self._requests_to_connector: dict[str, int] = {}
 
         # Keeps track of *additional* remaining async saves (beyond 1) to be
         # finished per request. Not needed for async loads since we only allow
@@ -136,25 +137,31 @@ class MultiConnector(KVConnectorBase_V1):
         request: "Request",
         num_computed_tokens: int,
     ) -> tuple[int, bool]:
-        for c in self._connectors:
+        to_return = (0, False)
+        for i, c in enumerate(self._connectors):
             toks, load_async = c.get_num_new_matched_tokens(
                 request, num_computed_tokens)
             # The first connector that has new matched tokens will be assigned
             # to this request.
-            if toks > 0:
-                self._requests_to_connector[request.request_id] = c
-                return toks, load_async
-        return 0, False
+            if to_return[0] == 0 and toks > 0:
+                self._requests_to_connector[request.request_id] = i
+                to_return = (toks, load_async)
+        return to_return
 
     def update_state_after_alloc(self, request: "Request",
                                  blocks: "KVCacheBlocks",
                                  num_external_tokens: int):
-        # If the request is not assigned to any connector, we do nothing.
-        if request.request_id not in self._requests_to_connector:
-            return
-        # We assume that the request is assigned to only one connector.
-        c = self._requests_to_connector.pop(request.request_id)
-        c.update_state_after_alloc(request, blocks, num_external_tokens)
+        chosen_connector = self._requests_to_connector.get(
+            request.request_id, -1)
+        for i, c in enumerate(self._connectors):
+            if i == chosen_connector:
+                # Forward call to the chosen connector (if any).
+                c.update_state_after_alloc(request, blocks,
+                                           num_external_tokens)
+            else:
+                # Call with empty blocks for other connectors.
+                c.update_state_after_alloc(request,
+                                           KVCacheBlocks.create_empty(), 0)
 
     def build_connector_meta(
             self,
@@ -170,7 +177,7 @@ class MultiConnector(KVConnectorBase_V1):
     def request_finished(
         self,
         request: "Request",
-        blocks: "KVCacheBlocks",
+        blocks: list[int],
     ) -> tuple[bool, Optional[dict[str, Any]]]:
         async_saves = 0
         kv_txfer_params = None
@@ -187,4 +194,8 @@ class MultiConnector(KVConnectorBase_V1):
                 kv_txfer_params = txfer_params
         if async_saves > 1:
             self._extra_async_saves[request.request_id] = async_saves - 1
+
+        # Clean up other state for this request.
+        self._requests_to_connector.pop(request.request_id, None)
+
         return async_saves > 0, kv_txfer_params
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 400fa29c9..f59970397 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -221,15 +221,6 @@ class NixlConnectorScheduler:
             if count > 0:
                 return count, True
 
-            # NOTE: if count is 0 here, we have less than block_size
-            # tokens to pull after subtracting the local prefix cache hit.
-            # The remote only sends fully computed blocks, so there is
-            # nothing to transfer but we still need to notify the
-            # prefill worker so that the remote blocks are freed.
-            if all(p in params for p in ("remote_engine_id", "remote_host",
-                                         "remote_port")):
-                self._reqs_need_recv[request.request_id] = (request, [])
-
         # No remote prefill for this request.
         return 0, False
 
@@ -247,9 +238,14 @@ class NixlConnectorScheduler:
             if params.get("remote_block_ids"):
                 if all(p in params for p in ("remote_engine_id", "remote_host",
                                              "remote_port")):
+                    # If remote_blocks and num_external_tokens = 0, we have
+                    # a full prefix cache hit on the D worker. We need to call
+                    # send_notif in _read_blocks to free the memory on the P.
+                    local_block_ids = (blocks.get_unhashed_block_ids()
+                                       if num_external_tokens > 0 else [])
                     # Get unhashed blocks to pull from remote.
                     self._reqs_need_recv[request.request_id] = (
-                        request, blocks.get_unhashed_block_ids())
+                        request, local_block_ids)
                 else:
                     logger.warning(
                         "Got invalid KVTransferParams: %s. This "
@@ -268,15 +264,6 @@ class NixlConnectorScheduler:
         # Loop through scheduled reqs and convert to ReqMeta.
         for req_id, (req, block_ids) in self._reqs_need_recv.items():
             assert req.kv_transfer_params is not None
-            # For the case where there are no remote blocks to pull
-            # (block_ids is empty), we don't need to schedule
-            # an async read on the worker side.
-            if not block_ids:
-                logger.debug(
-                    "Skipping adding request %s to NixlConnectorMetadata, "
-                    "as there are no remote blocks to pull", req_id)
-                continue
-
             meta.add_new_req(
                 request_id=req_id,
                 local_block_ids=block_ids,
@@ -660,26 +647,26 @@ class NixlConnectorWorker:
 
         # Number of D TP workers reading from a single P TP worker. This is
         # 1 when P and D `--tensor-parallel-size` match.
-        assert self._tp_size[self.engine_id] % self._tp_size[engine_id] == 0, \
-        "Local TP size must be divisible by remote TP size."
+        assert self._tp_size[self.engine_id] % self._tp_size[engine_id] == 0, (
+            "Local TP size must be divisible by remote TP size.")
         tp_ratio = self._tp_size[self.engine_id] // self._tp_size[engine_id]
-        assert tp_ratio > 0, "Decode TP cannot be smaller than"
-        " prefill TP"
+        assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
         if self.use_mla:
             # With MLA the only difference is in the number of blocks.
-            remote_block_size = nixl_agent_meta.block_len / (
+            remote_block_size = nixl_agent_meta.block_len // (
                 self.slot_size_bytes)
             assert self.block_len == nixl_agent_meta.block_len
         else:
-            remote_block_size = nixl_agent_meta.block_len / (
+            remote_block_size = nixl_agent_meta.block_len // (
                 self.slot_size_bytes * tp_ratio)
 
-            assert nixl_agent_meta.block_len == self.block_len * tp_ratio, \
-            "Remote P worker KV layer cache must be of shape [2, N, \
-            local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
+            assert nixl_agent_meta.block_len == self.block_len * tp_ratio, (
+                "Remote P worker KV layer cache must be of shape [2, N, "
+                "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
+            )
 
-        assert self.block_size == remote_block_size, "Remote P worker with \
-        different block size is not supported"
+        assert self.block_size == remote_block_size, "Remote P worker with "
+        "different block size is not supported"
 
         assert self.num_blocks >= nixl_agent_meta.num_blocks
 
@@ -712,9 +699,9 @@ class NixlConnectorWorker:
                     # (addr, len, device id)
                     blocks_data.append((addr, self.block_len, remote_tp_rank))
             logger.debug(
-                "Created %s blocks for dst engine %s with remote rank %s and " \
-                "local rank %s",
-                len(blocks_data), engine_id, remote_tp_rank, self.tp_rank)
+                "Created %s blocks for dst engine %s with remote rank %s and "
+                "local rank %s", len(blocks_data), engine_id, remote_tp_rank,
+                self.tp_rank)
 
             # Register with NIXL.
             descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 32d03b311..6a4c6bcf4 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -424,11 +424,11 @@ class Scheduler(SchedulerInterface):
                     # The request cannot be scheduled.
                     break
 
-                # KVConnector: update internal state after allocation.
+                # KVTransfer: the connector uses this info to determine
+                # if a load is needed. Note that
                 # This information is used to determine if a load is
                 # needed for this request.
-                if num_external_computed_tokens:
-                    assert self.connector is not None
+                if self.connector is not None:
                     self.connector.update_state_after_alloc(
                         request,
                         new_computed_blocks + new_blocks,
@@ -841,7 +841,7 @@ class Scheduler(SchedulerInterface):
         }
 
         finished_req_ids = self.finished_req_ids_dict
-        if finished_req_ids is not None:
+        if finished_req_ids:
             # Include ids of requests that finished since last outputs
             # were sent.
             for client_index, finished_set in finished_req_ids.items():
-- 
GitLab


From a408820f2fcdd4025f05f8a43dc15604fe534367 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Wed, 4 Jun 2025 23:00:59 -0400
Subject: [PATCH 181/274] [Bugfix] Fix port handling in make_zmq_path (#19117)

---
 vllm/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index 41336b80e..4f905e505 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2461,7 +2461,7 @@ def make_zmq_path(scheme: str, host: str, port: Optional[int] = None) -> str:
     Returns:
         A properly formatted ZMQ path string.
     """
-    if not port:
+    if port is None:
         return f"{scheme}://{host}"
     if is_valid_ipv6_address(host):
         return f"{scheme}://[{host}]:{port}"
-- 
GitLab


From 25b918eee6438d4d014ad7c900c43325a6908fba Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Wed, 4 Jun 2025 21:56:12 -0700
Subject: [PATCH 182/274] [Torch Nightly]add missing dependency (#18770)

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 docker/Dockerfile.nightly_torch     | 3 +++
 requirements/nightly_torch_test.txt | 9 +++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 53b8ccd80..8d43de77a 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -312,4 +312,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Logging to confirm the torch versions
 RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
 
+# Logging to confirm all the packages are installed
+RUN pip freeze
+
 #################### UNITTEST IMAGE #############################
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index e9b466d3a..3475ada9f 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -9,7 +9,9 @@ pytest-shard
 pytest-timeout
 
 librosa # required by audio tests in entrypoints/openai
-sentence-transformers
+sentence-transformers # required for embedding tests
+transformers==4.51.3
+transformers_stream_generator # required for qwen-vl test
 numba == 0.61.2; python_version > '3.9'
 # testing utils
 boto3
@@ -38,4 +40,7 @@ matplotlib # required for qwen-vl test
 # required for  Multi-Modal Models Test (Standard)
 num2words # required for smolvlm test
 pqdm
-timm # required for internvl test
\ No newline at end of file
+timm # required for internvl test
+
+schemathesis>=3.39.15  # Required for openai schema test.
+mteb>=1.38.11, <2 # required for mteb test
-- 
GitLab


From 0678b52251edd75405a1db81132c2d63d86b16b6 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 4 Jun 2025 22:40:04 -0700
Subject: [PATCH 183/274] Handle non-serializable objects when dumping
 benchmark results (#19114)

---
 .pre-commit-config.yaml       | 2 ++
 benchmarks/benchmark_utils.py | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 628782228..a105b0e14 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,6 +11,8 @@ repos:
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.11.7
   hooks:
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 272b7979c..283f938df 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -66,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
 
 def write_to_json(filename: str, records: list) -> None:
     with open(filename, "w") as f:
-        json.dump(records, f, cls=InfEncoder)
+        json.dump(
+            records,
+            f,
+            cls=InfEncoder,
+            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
+        )
-- 
GitLab


From af7fc84fd2e9b0fb70bc6349730b3ee73f0e1f8d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 4 Jun 2025 22:41:25 -0700
Subject: [PATCH 184/274] [BugFix][Minor] Fix full cuda graph bug when
 max_num_seqs < 512 (#19171)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4a67e3778..f6ccf0fa1 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1737,7 +1737,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # has num_tokens in total.
         assert num_tokens <= self.scheduler_config.max_num_batched_tokens
         max_num_reqs = self.scheduler_config.max_num_seqs
-        num_reqs = max_num_reqs if num_tokens >= max_num_reqs else num_tokens
+        num_reqs = min(num_tokens, max_num_reqs)
         min_tokens_per_req = num_tokens // num_reqs
         num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
         num_scheduled_tokens_list[-1] += num_tokens % num_reqs
@@ -1765,7 +1765,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     self.kv_cache_config.kv_cache_groups):
                 attn_metadata_i = (
                     self.attn_metadata_builders[kv_cache_group_id].build(
-                        num_reqs=num_tokens,
+                        num_reqs=num_reqs,
                         num_actual_tokens=num_tokens,
                         max_query_len=num_tokens,
                         common_prefix_len=0,
-- 
GitLab


From 8fc57501d3d35291c5f4ae036a6b6371f9f135b6 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Thu, 5 Jun 2025 14:24:24 +0800
Subject: [PATCH 185/274] [Bugfix]: Fix the incompatibility issue with stream
 when Thinking is disabled (#19135)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 .../test_completion_with_function_calling.py  | 116 +++++++++++++-----
 vllm/entrypoints/openai/serving_chat.py       |  16 ++-
 2 files changed, 101 insertions(+), 31 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 5c1f07832..5a1832865 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import NamedTuple
+
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
@@ -22,7 +24,9 @@ def server():  # noqa: F811
         "--guided-decoding-backend",
         "xgrammar",
         "--tool-call-parser",
-        "hermes"
+        "hermes",
+        "--reasoning-parser",
+        "qwen3",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -35,9 +39,53 @@ async def client(server):
         yield async_client
 
 
+class TestCase(NamedTuple):
+    model_name: str
+    stream: bool
+    tool_choice: str
+    enable_thinking: bool
+
+
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=True),
+    ],
+)
+async def test_function_tool_use(client: openai.AsyncOpenAI,
+                                 test_case: TestCase):
     tools = [
         {
             "type": "function",
@@ -126,30 +174,38 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
             "forecast for the next 5 days, in fahrenheit?",
         },
     ]
-
-    # Non-streaming test
-    chat_completion = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-    )
-
-    assert chat_completion.choices[0].message.tool_calls is not None
-    assert len(chat_completion.choices[0].message.tool_calls) > 0
-
-    # Streaming test
-    stream = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-        stream=True,
-    )
-
-    output = []
-    async for chunk in stream:
-        if chunk.choices and chunk.choices[0].delta.tool_calls:
-            output.extend(chunk.choices[0].delta.tool_calls)
-
-    assert len(output) > 0
+    if not test_case.stream:
+        # Non-streaming test
+        chat_completion = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        assert chat_completion.choices[0].message.tool_calls is not None
+        assert len(chat_completion.choices[0].message.tool_calls) > 0
+    else:
+        # Streaming test
+        stream = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            stream=True,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        output = []
+        async for chunk in stream:
+            if chunk.choices and chunk.choices[0].delta.tool_calls:
+                output.extend(chunk.choices[0].delta.tool_calls)
+
+        assert len(output) > 0
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 777b7f5bc..79eac184a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -689,7 +689,21 @@ class OpenAIServingChat(OpenAIServing):
                                     current_token_ids,
                                     output.token_ids,
                                 ))
-
+                            # When encountering think end id in prompt_token_ids
+                            # i.e {"enable_thinking": False},
+                            # set reasoning status to end.
+                            # Remove the text and token ids related
+                            # to 'reasoning_content'.
+                            if res.prompt_token_ids and \
+                                reasoning_parser.is_reasoning_end(
+                                    list(res.prompt_token_ids)):
+                                reasoning_end_arr[i] = True
+                                current_token_ids = list(output.token_ids)
+                                if delta_message and delta_message.content:
+                                    current_text = delta_message.content
+                                    delta_message.content = None
+                                else:
+                                    current_text = ""
                             # When encountering think end id in delta_token_ids,
                             # set reasoning status to end.
                             # Remove the text and token ids related
-- 
GitLab


From da4038021480773e5a83b5f860681b49a7a0eafa Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 4 Jun 2025 23:24:56 -0700
Subject: [PATCH 186/274] [Build] Annotate wheel and container path for release
 workflow (#19162)

Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .buildkite/release-pipeline.yaml       | 19 +++++++++++++++-
 .buildkite/scripts/annotate-release.sh | 31 ++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100755 .buildkite/scripts/annotate-release.sh

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index b3c27e2c9..16b5ad029 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,5 +1,6 @@
 steps:
   - label: "Build wheel - CUDA 12.8"
+    id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -11,6 +12,7 @@ steps:
       DOCKER_BUILDKIT: "1"
 
   - label: "Build wheel - CUDA 12.6"
+    id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -28,6 +30,7 @@ steps:
 
   - label: "Build wheel - CUDA 11.8"
     # depends_on: block-build-cu118-wheel
+    id: build-wheel-cuda-11-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -44,6 +47,7 @@ steps:
 
   - label: "Build release image"
     depends_on: block-release-image-build
+    id: build-release-image
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -51,6 +55,18 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
+  - label: "Annotate release workflow"
+    depends_on:
+      - build-release-image
+      - build-wheel-cuda-12-8
+      - build-wheel-cuda-12-6
+      - build-wheel-cuda-11-8
+    id: annotate-release-workflow
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-release.sh"
+
   - label: "Build and publish TPU release image"
     depends_on: ~
     if: build.env("NIGHTLY") == "1"
@@ -70,9 +86,10 @@ steps:
       DOCKER_BUILDKIT: "1"
 
   - input: "Provide Release version here"
+    id: input-release-version
     fields:
       - text: "What is the release version?"
-        key: "release-version"
+        key: release-version
 
   - block: "Build CPU release image"
     key: block-cpu-release-image-build
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
new file mode 100755
index 000000000..94e0ac239
--- /dev/null
+++ b/.buildkite/scripts/annotate-release.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -ex
+
+# Get release version and strip leading 'v' if present
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+
+if [ -z "$RELEASE_VERSION" ]; then
+  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
+  exit 1
+fi
+
+buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+To download the wheel:
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
+\`\`\`
+
+To download and upload the image:
+
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
+docker push vllm/vllm-openai:latest
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
+\`\`\`
+EOF 
\ No newline at end of file
-- 
GitLab


From 18093084be935fe8aad11a45366bea060b33d60f Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Thu, 5 Jun 2025 16:08:26 +0800
Subject: [PATCH 187/274] [Misc] Remove unnecessary fallback to prefill-decode
 attention (#19138)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 vllm/v1/attention/backends/triton_attn.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 968f13701..5db592b15 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -171,10 +171,7 @@ class TritonAttentionImpl(AttentionImpl):
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
 
-        num_queries_per_kv = query.shape[1] // key.shape[1]
-        num_q_is_pow2 = (num_queries_per_kv & (num_queries_per_kv - 1)) == 0
-        use_prefill_decode_attn = (self.force_prefill_decode_attn
-                                   or not num_q_is_pow2)
+        use_prefill_decode_attn = self.force_prefill_decode_attn
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         if use_prefill_decode_attn:
-- 
GitLab


From 188a4590d852324113a80cef6d0c237d6417a9d6 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Thu, 5 Jun 2025 04:14:32 -0700
Subject: [PATCH 188/274] [Misc] Do not override NCCL_CUMEM_ENABLE if set
 explicitly (#19105)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 vllm/env_override.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/vllm/env_override.py b/vllm/env_override.py
index b0a061d2c..2bede4963 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -4,17 +4,23 @@ import os
 
 import torch
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 # set some common config/environment variables that should be set
 # for all processes created by vllm and all processes
 # that interact with vllm workers.
 # they are executed whenever `import vllm` is called.
 
-if not os.path.exists('/dev/nvidia-caps-imex-channels'):
-    # normally, we disable NCCL_CUMEM_ENABLE because it
-    # will cost 1~2 GiB GPU memory with cudagraph+allreduce,
-    # see https://github.com/NVIDIA/nccl/issues/1234
-    # for more details.
-    # However, NCCL requires NCCL_CUMEM_ENABLE to work with
+if 'NCCL_CUMEM_ENABLE' in os.environ:
+    logger.warning(
+        "NCCL_CUMEM_ENABLE is set to %s, skipping override. "
+        "This may increase memory overhead with cudagraph+allreduce: "
+        "https://github.com/NVIDIA/nccl/issues/1234",
+        os.environ['NCCL_CUMEM_ENABLE'])
+elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
+    # NCCL requires NCCL_CUMEM_ENABLE to work with
     # multi-node NVLink, typically on GB200-NVL72 systems.
     # The ultimate way to detect multi-node NVLink is to use
     # NVML APIs, which are too expensive to call here.
-- 
GitLab


From 1aeb925f34495f505bf3053e4f66b8ca4886452e Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Thu, 5 Jun 2025 19:16:25 +0800
Subject: [PATCH 189/274] [Frontend] improve vllm run-batch --help display
 (#19187)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 vllm/entrypoints/cli/main.py      |  4 ++--
 vllm/entrypoints/cli/run_batch.py |  8 +++++++-
 vllm/entrypoints/cli/serve.py     |  6 +++---
 vllm/entrypoints/utils.py         | 14 +++++++++++---
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index 3e834b3b2..9bb1162e3 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -11,7 +11,7 @@ import vllm.entrypoints.cli.openai
 import vllm.entrypoints.cli.run_batch
 import vllm.entrypoints.cli.serve
 import vllm.version
-from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup
+from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
 from vllm.utils import FlexibleArgumentParser
 
 CMD_MODULES = [
@@ -37,7 +37,7 @@ def main():
 
     parser = FlexibleArgumentParser(
         description="vLLM CLI",
-        epilog=VLLM_SERVE_PARSER_EPILOG,
+        epilog=VLLM_SUBCMD_PARSER_EPILOG,
     )
     parser.add_argument('-v',
                         '--version',
diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py
index 353034f88..6bdd3b63c 100644
--- a/vllm/entrypoints/cli/run_batch.py
+++ b/vllm/entrypoints/cli/run_batch.py
@@ -10,6 +10,8 @@ from vllm.entrypoints.cli.types import CLISubcommand
 from vllm.entrypoints.logger import logger
 from vllm.entrypoints.openai.run_batch import main as run_batch_main
 from vllm.entrypoints.openai.run_batch import make_arg_parser
+from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
+                                    show_filtered_argument_or_group_from_help)
 from vllm.utils import FlexibleArgumentParser
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -49,7 +51,11 @@ class RunBatchSubcommand(CLISubcommand):
             usage=
             "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
         )
-        return make_arg_parser(run_batch_parser)
+        run_batch_parser = make_arg_parser(run_batch_parser)
+        show_filtered_argument_or_group_from_help(run_batch_parser,
+                                                  "run-batch")
+        run_batch_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG
+        return run_batch_parser
 
 
 def cmd_init() -> list[CLISubcommand]:
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index f9c56e655..51807a953 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.api_server import (run_server, run_server_worker,
                                                 setup_server)
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                               validate_parsed_serve_args)
-from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG,
+from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
                                     show_filtered_argument_or_group_from_help)
 from vllm.executor.multiproc_worker_utils import _add_prefix
 from vllm.logger import init_logger
@@ -101,8 +101,8 @@ class ServeSubcommand(CLISubcommand):
         )
 
         serve_parser = make_arg_parser(serve_parser)
-        show_filtered_argument_or_group_from_help(serve_parser)
-        serve_parser.epilog = VLLM_SERVE_PARSER_EPILOG
+        show_filtered_argument_or_group_from_help(serve_parser, "serve")
+        serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG
         return serve_parser
 
 
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 6fb32ff18..16ba2b453 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -14,8 +14,9 @@ from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
-VLLM_SERVE_PARSER_EPILOG = (
-    "Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.\n"
+VLLM_SUBCMD_PARSER_EPILOG = (
+    "Tip: Use `vllm [serve|run-batch] --help=<keyword>` "
+    "to explore arguments from help.\n"
     "   - To view a argument group:     --help=ModelConfig\n"
     "   - To view a single argument:    --help=max-num-seqs\n"
     "   - To search by keyword:         --help=max\n"
@@ -173,8 +174,15 @@ def _validate_truncation_size(
     return truncate_prompt_tokens
 
 
-def show_filtered_argument_or_group_from_help(parser):
+def show_filtered_argument_or_group_from_help(parser, subcommand_name):
     import sys
+
+    # Only handle --help=<keyword> for the current subcommand.
+    # Since subparser_init() runs for all subcommands during CLI setup,
+    # we skip processing if the subcommand name is not in sys.argv.
+    if subcommand_name not in sys.argv:
+        return
+
     for arg in sys.argv:
         if arg.startswith('--help='):
             search_keyword = arg.split('=', 1)[1]
-- 
GitLab


From 9bc8bb07cf16c8a9116b8029e63d1015d150cccb Mon Sep 17 00:00:00 2001
From: Guillaume Calmettes <gcalmettes@scaleway.com>
Date: Thu, 5 Jun 2025 14:59:28 +0200
Subject: [PATCH 190/274] [Bugfix] properly catch PIL-related errors for vision
 models when incorrect data urls are provided (#19202)

Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
---
 tests/multimodal/test_utils.py | 13 +++++++++++++
 vllm/multimodal/utils.py       | 30 +++++++++++++++++++-----------
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index e4debb47c..c8a544822 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -141,6 +141,19 @@ async def test_fetch_image_local_files(image_url: str):
                 f"file://{temp_dir}/../{os.path.basename(image_url)}")
 
 
+@pytest.mark.asyncio
+async def test_fetch_image_error_conversion():
+    connector = MediaConnector()
+    broken_img = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
+
+    # PIL.UnidentifiedImageError should be converted to ValueError
+    with pytest.raises(ValueError):
+        await connector.fetch_image_async(broken_img)
+
+    with pytest.raises(ValueError):
+        connector.fetch_image(broken_img)
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 2b34cdf40..11a25f851 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -9,7 +9,7 @@ from urllib.parse import ParseResult, urlparse
 import numpy as np
 import numpy.typing as npt
 import torch
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
@@ -185,11 +185,15 @@ class MediaConnector:
         """
         image_io = ImageMediaIO(image_mode=image_mode)
 
-        return self.load_from_url(
-            image_url,
-            image_io,
-            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
+        try:
+            return self.load_from_url(
+                image_url,
+                image_io,
+                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+            )
+        except UnidentifiedImageError as e:
+            # convert to ValueError to be properly caught upstream
+            raise ValueError(str(e)) from e
 
     async def fetch_image_async(
         self,
@@ -204,11 +208,15 @@ class MediaConnector:
         """
         image_io = ImageMediaIO(image_mode=image_mode)
 
-        return await self.load_from_url_async(
-            image_url,
-            image_io,
-            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
+        try:
+            return await self.load_from_url_async(
+                image_url,
+                image_io,
+                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+            )
+        except UnidentifiedImageError as e:
+            # convert to ValueError to be properly caught upstream
+            raise ValueError(str(e)) from e
 
     def fetch_video(
         self,
-- 
GitLab


From f20f9f063b8a61a96f055470c03dd26de8a2c19e Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 5 Jun 2025 17:27:41 +0200
Subject: [PATCH 191/274] [mistral_common] Add v11 tokenizer (#19193)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 .../tool_parsers/mistral_tool_parser.py       | 34 ++++++++++++++++---
 vllm/transformers_utils/tokenizers/mistral.py |  2 ++
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index ef5b14f3c..ab1cfd4b6 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -44,11 +44,17 @@ class MistralToolCall(ToolCall):
         return id.isalnum() and len(id) == 9
 
 
+def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
+    return isinstance(model_tokenizer, MistralTokenizer) \
+        and model_tokenizer.version >= 11
+
+
 @ToolParserManager.register_module("mistral")
 class MistralToolParser(ToolParser):
     """
-    Tool call parser for Mistral 7B Instruct v0.3, intended for use with the
-    examples/tool_chat_template_mistral.jinja template.
+    Tool call parser for Mistral 7B Instruct v0.3, intended for use with
+    - [`mistral_common`](https://github.com/mistralai/mistral-common/)
+    - the examples/tool_chat_template_mistral.jinja template.
 
     Used when --enable-auto-tool-choice --tool-call-parser mistral are all set
     """
@@ -70,6 +76,12 @@ class MistralToolParser(ToolParser):
         self.bot_token = "[TOOL_CALLS]"
         self.bot_token_id = self.vocab.get(self.bot_token)
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
+        if _is_fn_name_regex_support(self.model_tokenizer):
+            self.fn_name_regex = re.compile(r'([a-zA-Z0-9_-]+)(\{.*?\})',
+                                            re.DOTALL)
+        else:
+            self.fn_name_regex = None
+
         if self.bot_token_id is None:
             raise RuntimeError(
                 "Mistral Tool Parser could not locate the tool call token in "
@@ -109,11 +121,25 @@ class MistralToolParser(ToolParser):
         tool_content = model_output.replace(self.bot_token, "").strip()
 
         try:
-
             # we first try to directly load the json as parsing very nested
             # jsons is difficult
             try:
-                function_call_arr = json.loads(tool_content)
+                if self.fn_name_regex:
+                    matches = self.fn_name_regex.findall(tool_content)
+
+                    function_call_arr = []
+                    for match in matches:
+                        fn_name = match[0]
+                        args = match[1]
+
+                        # fn_name is encoded outside serialized json dump
+                        # only arguments are serialized
+                        function_call_arr.append({
+                            "name": fn_name,
+                            "arguments": json.loads(args)
+                        })
+                else:
+                    function_call_arr = json.loads(tool_content)
             except json.JSONDecodeError:
                 # use a regex to find the part corresponding to the tool call.
                 # NOTE: This use case should not happen if the model is trained
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index fcc0f538f..24ac4580d 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -187,6 +187,8 @@ class MistralTokenizer(TokenizerBase):
     def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
         self.mistral = tokenizer
         self.instruct = tokenizer.instruct_tokenizer
+        _mistral_version_str = self.instruct.tokenizer.version.value
+        self.version: int = int(_mistral_version_str.split("v")[-1])
 
         tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
         from mistral_common.tokens.tokenizers.tekken import (
-- 
GitLab


From ec89524f50cccd6f787d604ea2b3479a126e991a Mon Sep 17 00:00:00 2001
From: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Date: Fri, 6 Jun 2025 00:38:54 +0800
Subject: [PATCH 192/274] Add H20-3e fused MoE kernel tuning configs for
 DeepSeek-R1/V3 (#19205)

---
 ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 000000000..9264ca17f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
-- 
GitLab


From 61059bee40511b6f6c044053cf921da81cf89985 Mon Sep 17 00:00:00 2001
From: Chiyue Wei <92623189+dubcyfor3@users.noreply.github.com>
Date: Thu, 5 Jun 2025 09:48:26 -0700
Subject: [PATCH 193/274] [Hardware][NVIDIA] FP4 MoE kernel optimization
 (#19110)

Signed-off-by: Chiyue Wei <chiyuew@nvidia.com>
Co-authored-by: Chiyue Wei <chiyuew@nvidia.com>
---
 .../kernels/benchmark_cutlass_fp4_moe.py      |  2 +-
 csrc/moe/moe_ops.h                            |  6 +-
 csrc/moe/moe_permute_unpermute_op.cu          | 56 +++++++++++++++++++
 csrc/moe/permute_unpermute_kernels/dispatch.h | 18 ++++--
 csrc/moe/torch_bindings.cpp                   |  6 ++
 csrc/ops.h                                    |  3 +-
 .../quantization/cutlass_w8a8/moe/moe_data.cu | 36 ++++++++++--
 .../cutlass_w8a8/scaled_mm_entry.cu           |  9 ++-
 csrc/torch_bindings.cpp                       |  2 +-
 tests/kernels/moe/test_nvfp4_moe.py           |  5 +-
 vllm/_custom_ops.py                           | 45 +++++++++++----
 .../layers/fused_moe/cutlass_moe.py           | 15 ++---
 12 files changed, 165 insertions(+), 38 deletions(-)

diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
index 3383fb788..35c20ee41 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -91,7 +91,7 @@ def bench_run(
 
     score = torch.randn((m, num_experts), device=device, dtype=dtype)
 
-    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
 
     quant_blocksize = 16
     w1_blockscale = torch.empty(
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 8fda434d4..c4faef731 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -30,4 +30,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              int64_t BLOCK_SIZE_K, int64_t bit);
 #endif
 
-bool moe_permute_unpermute_supported();
\ No newline at end of file
+bool moe_permute_unpermute_supported();
+
+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor);
\ No newline at end of file
diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu
index 9a7465261..68f429fac 100644
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -130,6 +130,62 @@ void moe_unpermute(
   });
 }
 
+template <typename T>
+__global__ void shuffleInputRowsKernel(const T* input,
+                                       const int32_t* dst2src_map, T* output,
+                                       int64_t num_src_rows,
+                                       int64_t num_dst_rows, int64_t num_cols) {
+  int64_t dest_row_idx = blockIdx.x;
+  int64_t const source_row_idx = dst2src_map[dest_row_idx];
+
+  if (blockIdx.x < num_dst_rows) {
+    // Load 128-bits per thread
+    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
+    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
+
+    // Duplicate and permute rows
+    auto const* source_row_ptr =
+        reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
+    auto* dest_row_ptr =
+        reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
+
+    int64_t const start_offset = threadIdx.x;
+    int64_t const stride = blockDim.x;
+    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
+
+    for (int elem_index = start_offset; elem_index < num_elems_in_col;
+         elem_index += stride) {
+      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
+    }
+  }
+}
+
+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor) {
+  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
+              "Input and output tensors must have the same data type");
+
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
+  int64_t const blocks = output_tensor.size(0);
+  int64_t const threads = 256;
+  int64_t const num_dest_rows = output_tensor.size(0);
+  int64_t const num_src_rows = input_tensor.size(0);
+  int64_t const num_cols = input_tensor.size(1);
+
+  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
+              "num_cols must be divisible by 128 / "
+              "sizeof(input_tensor.scalar_type()) / 8");
+
+  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+        dst2src_map.data_ptr<int32_t>(),
+        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+        num_dest_rows, num_cols);
+  });
+}
+
 #else
 
 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
diff --git a/csrc/moe/permute_unpermute_kernels/dispatch.h b/csrc/moe/permute_unpermute_kernels/dispatch.h
index 41932cdd8..d0f1ea4ad 100644
--- a/csrc/moe/permute_unpermute_kernels/dispatch.h
+++ b/csrc/moe/permute_unpermute_kernels/dispatch.h
@@ -14,12 +14,13 @@
     __VA_ARGS__();                                         \
     break;                                                 \
   }
-#define MOE_DISPATCH_FLOAT_CASE(...)                          \
-  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)       \
-  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)        \
-  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)    \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
+#define MOE_DISPATCH_FLOAT_CASE(...)                            \
+  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)         \
+  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)          \
+  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)      \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)   \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
 
 #define MOE_DISPATCH(TYPE, ...) \
   MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@@ -39,6 +40,11 @@ template <>
 struct ScalarType2CudaType<at::ScalarType::BFloat16> {
   using type = __nv_bfloat16;
 };
+// uint8 for packed fp4
+template <>
+struct ScalarType2CudaType<at::ScalarType::Byte> {
+  using type = uint8_t;
+};
 
 // #if __CUDA_ARCH__ >= 890
 // fp8
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 7d35ec79e..a74eb3720 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -81,6 +81,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   m.def("moe_permute_unpermute_supported() -> bool");
   m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
 
+  // Row shuffle for MoE
+  m.def(
+      "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
+      "output_tensor) -> ()");
+  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
+
 #endif
 }
 
diff --git a/csrc/ops.h b/csrc/ops.h
index 297f32b4a..6905ef6e5 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -248,7 +248,8 @@ void get_cutlass_moe_mm_data(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets);
 
 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
index 894727383..ac414e1bc 100644
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
@@ -45,6 +45,23 @@ __global__ void compute_expert_offsets(
   }
 }
 
+__global__ void compute_expert_blockscale_offsets(
+    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
+    int32_t* blockscale_offsets, int32_t* atomic_buffer,
+    const int num_experts) {
+  int32_t tot_offset = 0;
+  int32_t tot_offset_round = 0;
+  expert_offsets[0] = 0;
+  blockscale_offsets[0] = 0;
+  for (int i = 0; i < num_experts; ++i) {
+    atomic_buffer[i] = tot_offset;
+    tot_offset += problem_sizes1[i * 3];
+    expert_offsets[i + 1] = tot_offset;
+    tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
+    blockscale_offsets[i + 1] = tot_offset_round;
+  }
+}
+
 __global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
                                   const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
@@ -77,7 +94,8 @@ void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k) {
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets) {
   auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
   auto options_int32 =
       torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
@@ -89,10 +107,18 @@ void get_cutlass_moe_mm_data_caller(
       static_cast<int32_t*>(problem_sizes1.data_ptr()),
       static_cast<int32_t*>(problem_sizes2.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
-  compute_expert_offsets<<<1, 1, 0, stream>>>(
-      static_cast<const int32_t*>(problem_sizes1.data_ptr()),
-      static_cast<int32_t*>(expert_offsets.data_ptr()),
-      static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+  if (blockscale_offsets.has_value()) {
+    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+  } else {
+    compute_expert_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+  }
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
       static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<const int32_t*>(expert_offsets.data_ptr()),
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index e9b408fbf..ee93440b5 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -54,7 +54,8 @@ void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets);
 #endif
 
 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@@ -224,7 +225,8 @@ void get_cutlass_moe_mm_data(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k) {
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets) {
   // This function currently gets compiled only if we have a valid cutlass moe
   // mm to run it for.
   int32_t version_num = get_sm_version_num();
@@ -232,7 +234,8 @@ void get_cutlass_moe_mm_data(
     (defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
   get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
                                  problem_sizes2, input_permutation,
-                                 output_permutation, num_experts, n, k);
+                                 output_permutation, num_experts, n, k,
+                                 blockscale_offsets);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 3fffaf290..93916b7f9 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -450,7 +450,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "                        Tensor! problem_sizes1, Tensor! problem_sizes2, "
       "                        Tensor! input_permutation, "
       "                        Tensor! output_permutation, int num_experts, "
-      "                        int n, int k) -> ()",
+      "                        int n, int k, Tensor? blockscale_offsets) -> ()",
       {stride_tag});
   ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
 
diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py
index be33200cc..22482d9ca 100644
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -80,7 +80,10 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
                 w2[expert], w2_gs[expert])
 
         score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(a,
+                                               score,
+                                               topk,
+                                               renormalize=False)
 
         a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32)
         a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32)
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 3282edf41..14404cd73 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -845,11 +845,16 @@ def cutlass_scaled_sparse_mm(
     return out
 
 
-def get_cutlass_moe_mm_data(
-        topk_ids: torch.Tensor, expert_offsets: torch.Tensor,
-        problem_sizes1: torch.Tensor, problem_sizes2: torch.Tensor,
-        input_permutation: torch.Tensor, output_permutation: torch.Tensor,
-        num_experts: int, n: int, k: int):
+def get_cutlass_moe_mm_data(topk_ids: torch.Tensor,
+                            expert_offsets: torch.Tensor,
+                            problem_sizes1: torch.Tensor,
+                            problem_sizes2: torch.Tensor,
+                            input_permutation: torch.Tensor,
+                            output_permutation: torch.Tensor,
+                            num_experts: int,
+                            n: int,
+                            k: int,
+                            blockscale_offsets: Optional[torch.Tensor] = None):
     """
     Prepare data necessary to perform CUTLASS grouped matrix multiplications
     used in CUTLASS-based fused MoE.
@@ -867,12 +872,31 @@ def get_cutlass_moe_mm_data(
                          before executing the MMs.
     - output_permutation: Permutation that must be used to shuffle the output
                           after executing the MMs.
+    - blockscale_offsets: Optional argument passed for fp4 moe. Indices that
+                          mark at which block scale index each expert begins
+                          its computation. The number of block scale rows
+                          computed with expert E is blockscale_offsets[E + 1] -
+                          blockscale_offsets[E]
     """
     return torch.ops._C.get_cutlass_moe_mm_data(topk_ids, expert_offsets,
                                                 problem_sizes1, problem_sizes2,
                                                 input_permutation,
                                                 output_permutation,
-                                                num_experts, n, k)
+                                                num_experts, n, k,
+                                                blockscale_offsets)
+
+
+def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
+    """
+    Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
+    This is used in MoE to permute the input tensor before performing grouped matrix multiplications.
+    """
+    num_tokens_permuted = dst2src_map.shape[0]
+    output_tensor = torch.empty((num_tokens_permuted, input_tensor.shape[1]),
+                                device=input_tensor.device,
+                                dtype=input_tensor.dtype)
+    torch.ops._moe_C.shuffle_rows(input_tensor, dst2src_map, output_tensor)
+    return output_tensor
 
 
 def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
@@ -1124,14 +1148,12 @@ def scaled_fp4_experts_quant(
     expert_offsets: torch.Tensor,
     blockscale_offsets: torch.Tensor,
     topk: int,
-    expert_map: Optional[torch.Tensor] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP4 and return quantized tensor and scale, for
     packed MoE Inputs.
     Args:
-        input: The input tensor to be quantized to FP4
-        expert_map: The expert map tensor
+        input_tensor: The input tensor to be quantized to FP4
         input_global_scale: A scalar scaling factor for the entire tensor.
         expert_offsets: The expert offsets tensor
         blockscale_offsets: The blockscale offsets tensor
@@ -1143,14 +1165,13 @@ def scaled_fp4_experts_quant(
     assert input_tensor.ndim == 2, (
         f'input.ndim needs to be == 2, but got {input_tensor.ndim}.')
 
-    input_tensor = input_tensor[
-        expert_map] if expert_map is not None else input_tensor
-    m_numtopk, k = input_tensor.shape
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
     # from running out of memory. This value can also be increased to support
     # larger models.
     MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
+    m_numtopk, k = input_tensor.shape
+
     assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), (
         f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
         f"{MAX_TOKENS_PER_EXPERT})"
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index d827869d0..e9446bc5f 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -333,6 +333,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
     num_topk = topk_ids.shape[1]
 
     expert_offsets = torch.empty((e + 1), dtype=torch.int32, device=device)
+    blockscale_offsets = torch.empty((e + 1), dtype=torch.int32, device=device)
     # Problem size:  (num_experts, (m,2n,k))
     problem_sizes1 = torch.empty((e, 3), dtype=torch.int32, device=device)
     # Problem size:  (num_experts, (m,n,k))
@@ -344,12 +345,10 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
     # problem shapes should have [m, n, k]
     # Note that problem sizes are based on logical number of elements.
     ops.get_cutlass_moe_mm_data(topk_ids, expert_offsets, problem_sizes1,
-                                problem_sizes2, a_map, c_map, e, n, k)
+                                problem_sizes2, a_map, c_map, e, n, k,
+                                blockscale_offsets)
 
-    tokens_per_expert = problem_sizes1[:, 0]
-    rounded_tokens_per_expert = (tokens_per_expert + (128 - 1)) // 128 * 128
-    blockscale_offsets = torch.zeros(e + 1, dtype=torch.int32, device=device)
-    blockscale_offsets[1:] = torch.cumsum(rounded_tokens_per_expert, dim=0)
+    a = ops.shuffle_rows(a, a_map)
 
     rep_a_fp4, rep_a_blockscale = ops.scaled_fp4_experts_quant(
         a,
@@ -357,7 +356,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
         expert_offsets,
         blockscale_offsets,
         num_topk,
-        expert_map=a_map)
+    )
 
     c1 = ops.cutlass_fp4_moe_mm(rep_a_fp4, w1_fp4, rep_a_blockscale,
                                 w1_blockscale, w1_alphas, problem_sizes1,
@@ -378,6 +377,8 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
                                 w2_alphas, problem_sizes2, expert_offsets[:-1],
                                 blockscale_offsets[:-1], out_dtype, device)
     del int_fp4, int_blockscale
-    out = (c2[c_map].view(m, num_topk, k) *
+
+    c2 = ops.shuffle_rows(c2, c_map)
+    out = (c2.view(m, num_topk, k) *
            topk_weights.view(m, num_topk, 1).half()).sum(dim=1)
     return out.to(dtype=out_dtype)
-- 
GitLab


From 85e2b7bb1380dd7096e0d6b64b0d7633b0a9db4a Mon Sep 17 00:00:00 2001
From: Povilas Kanapickas <povilas@radix.lt>
Date: Thu, 5 Jun 2025 19:53:08 +0300
Subject: [PATCH 194/274] [MISC][Bugfix] Use less CPU when message queue has
 been empty for some time (#16226)

Signed-off-by: Povilas Kanapickas <povilas@radix.lt>
---
 .../test_basic_correctness.py                 | 28 +++++++----
 .../device_communicators/shm_broadcast.py     | 47 ++++++++++++++++++-
 vllm/envs.py                                  |  6 +++
 3 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 46be4a3c3..2e103019f 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -128,15 +128,21 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
     "model, distributed_executor_backend, attention_backend, "
-    "test_suite", [
-        ("distilbert/distilgpt2", "ray", "", "L4"),
-        ("distilbert/distilgpt2", "mp", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
-        ("distilbert/distilgpt2", "ray", "", "A100"),
-        ("distilbert/distilgpt2", "mp", "", "A100"),
-        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    "test_suite, extra_env", [
+        ("distilbert/distilgpt2", "ray", "", "L4", {}),
+        ("distilbert/distilgpt2", "mp", "", "L4", {}),
+        ("distilbert/distilgpt2", "ray", "", "L4", {
+            "VLLM_SLEEP_WHEN_IDLE": "1"
+        }),
+        ("distilbert/distilgpt2", "mp", "", "L4", {
+            "VLLM_SLEEP_WHEN_IDLE": "1"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
+        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
+        ("distilbert/distilgpt2", "ray", "", "A100", {}),
+        ("distilbert/distilgpt2", "mp", "", "A100", {}),
+        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100", {}),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", {}),
     ])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models_distributed(
@@ -148,6 +154,7 @@ def test_models_distributed(
     distributed_executor_backend: str,
     attention_backend: str,
     test_suite: str,
+    extra_env: dict[str, str],
     enable_prompt_embeds: bool,
 ) -> None:
 
@@ -173,6 +180,9 @@ def test_models_distributed(
                 attention_backend,
             )
 
+        for k, v in extra_env.items():
+            monkeypatch_context.setenv(k, v)
+
         dtype = "half"
         max_tokens = 5
 
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 0f66f0aeb..c7810043b 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -28,6 +28,43 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
 logger = init_logger(__name__)
 
 
+class SpinTimer:
+
+    def record_activity(self):
+        pass
+
+    def spin(self):
+        sched_yield()
+
+
+class SpinSleepTimer(SpinTimer):
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when vllm does nothing. This would lead to more
+    CPU thermal headroom when a request eventually comes, especially when
+    multiple GPUs are connected as each GPU would otherwise pin one thread at
+    100% CPU usage.
+
+    The simplest solution is to reduce polling frequency when there is no
+    activity for a certain period of time.
+    """
+
+    def __init__(self, busy_loop_s: float = 3.0, wait_sleep_s: float = 0.1):
+        self.last_activity = time.monotonic()
+        self.busy_loop_s = busy_loop_s
+        self.wait_sleep_s = wait_sleep_s
+
+    def record_activity(self):
+        self.last_activity = time.monotonic()
+
+    def spin(self):
+        curr_time = time.monotonic()
+        if curr_time >= self.last_activity + self.busy_loop_s:
+            time.sleep(self.wait_sleep_s)
+        else:
+            sched_yield()
+
+
 class ShmRingBuffer:
 
     def __init__(self,
@@ -42,7 +79,7 @@ class ShmRingBuffer:
         of items that can be stored in the buffer are known in advance.
         In this case, we don't need to synchronize the access to
          the buffer.
-        
+
         Buffer memory layout:
                   data                                 metadata
                     |                                      |
@@ -238,6 +275,7 @@ class MessageQueue:
         self.local_reader_rank = -1
         # rank does not matter for remote readers
         self._is_remote_reader = False
+        self._read_spin_timer = SpinTimer()
 
         self.handle = Handle(
             local_reader_ranks=local_reader_ranks,
@@ -276,6 +314,9 @@ class MessageQueue:
             self.local_socket.connect(socket_addr)
 
             self.remote_socket = None
+
+            self._read_spin_timer = SpinSleepTimer(
+            ) if envs.VLLM_SLEEP_WHEN_IDLE else SpinTimer()
         else:
             self.buffer = None  # type: ignore
             self.current_idx = -1
@@ -407,7 +448,7 @@ class MessageQueue:
                     # we need to wait until it is written
 
                     # Release the processor to other threads
-                    sched_yield()
+                    self._read_spin_timer.spin()
 
                     # if we wait for a long time, log a message
                     if (time.monotonic() - start_time
@@ -438,6 +479,8 @@ class MessageQueue:
                 metadata_buffer[self.local_reader_rank + 1] = 1
                 self.current_idx = (self.current_idx +
                                     1) % self.buffer.max_chunks
+
+                self._read_spin_timer.record_activity()
                 break
 
     def enqueue(self, obj, timeout: Optional[float] = None):
diff --git a/vllm/envs.py b/vllm/envs.py
index 08bf2dad4..ffb630079 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -122,6 +122,7 @@ if TYPE_CHECKING:
     VLLM_ALL2ALL_BACKEND: str = "naive"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
+    VLLM_SLEEP_WHEN_IDLE: bool = False
 
 
 def get_default_cache_root():
@@ -841,6 +842,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Regex timeout for use by the vLLM tool parsing plugins.
     "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
     lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),
+
+    # Reduce CPU usage when vLLM is idle. Enabling this will incur small
+    # latency penalty when a request eventually comes.
+    "VLLM_SLEEP_WHEN_IDLE":
+    lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
-- 
GitLab


From 9ef9173cfa338ea7742f23ee9d0038e580081390 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Thu, 5 Jun 2025 19:10:15 +0200
Subject: [PATCH 195/274] [P/D][NixlConnector] Enable FlashInfer backend
 (#19090)

---
 .../kv_connector/v1/nixl_connector.py         | 65 ++++++++++++++-----
 vllm/platforms/interface.py                   |  1 +
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index f59970397..7552fc889 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -15,6 +15,7 @@ import torch
 import zmq
 
 from vllm import envs
+from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
@@ -22,6 +23,7 @@ from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
     get_tp_group)
 from vllm.logger import init_logger
+from vllm.platforms import _Backend
 from vllm.utils import make_zmq_path, make_zmq_socket, round_down
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
@@ -57,6 +59,7 @@ class NixlAgentMetadata(
     num_blocks: int
     tp_size: int
     block_len: int
+    attn_backend_name: str
 
 
 @dataclass
@@ -384,11 +387,25 @@ class NixlConnectorWorker:
 
         self.vllm_config = vllm_config
         self.block_size = vllm_config.cache_config.block_size
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
 
         # TODO(mgoin): remove this once we have hybrid memory allocator
         # Optimization for models with local attention (Llama 4)
         # List of block window sizes for each layer for local attention
         self.block_window_per_layer: list[Optional[int]] = []
+        self.use_mla = self.model_config.use_mla
+
+        backend = get_attn_backend(self.model_config.get_head_size(),
+                                   self.model_config.dtype,
+                                   self.cache_config.cache_dtype,
+                                   self.block_size,
+                                   self.model_config.is_attention_free,
+                                   use_mla=self.use_mla)
+        self.backend_name = backend.get_name()
+        attn_backend = backend_name_to_enum(self.backend_name)
+        self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1
+        logger.debug("Detected attention backend %s", self.backend_name)
 
         self._tp_size: dict[str, int] = {self.engine_id: self.world_size}
         # With heterogeneous TP, P must wait for all assigned D TP workers to
@@ -472,12 +489,16 @@ class NixlConnectorWorker:
         kv_elem_size = first_kv_cache.element_size()
 
         # TODO(tms): Find a more robust way to detect and handle MLA
-        self.use_mla = len(first_kv_cache.shape) == 3
         # NOTE (NickLucche) To move blocks efficiently with NIXL, the expected
         # KV memory layout is HND, as opposed to the default NHD. Note that it
         # will only affects the strides. For MLA instead, we make require no
         # such thing and resort to the standard layout.
-        if self.use_mla:
+        use_mla = len(first_kv_cache.shape) == 3
+        assert use_mla == self.use_mla
+
+        # TODO (NickLucche) not compatible with hybrid allocator. Enforce check
+        # once it goes live, as a single kv layout is expected for xfers.
+        if use_mla:
             # MLA case.
             self.num_blocks = first_kv_cache.shape[0]
             block_rank = 2  # [block_size, latent_dim]
@@ -485,11 +506,16 @@ class NixlConnectorWorker:
             block_size, kv_latent_dim = block_shape
             self.slot_size_bytes = kv_elem_size * kv_latent_dim
         else:
-            # [2 (k and v), num_blocks, block_size, kv_heads, head_dim]
-            self.num_blocks = first_kv_cache.shape[1]
-            block_rank = 3  # [block_size, kv_heads, head_dim]
+            # [2 (k and v), num_blocks, ...]
+            if self._use_flashinfer:
+                # FlashInfer swaps 2<->num_blocks dimensions.
+                self.num_blocks = first_kv_cache.shape[0]
+                block_rank = 4  # [2, block_size, kv_heads, head_dim]
+            else:
+                self.num_blocks = first_kv_cache.shape[1]
+                block_rank = 3  # [block_size, kv_heads, head_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
-            block_size, n_kv_heads, head_dim = block_shape
+            block_size, n_kv_heads, head_dim = block_shape[-3:]
             # head size in bytes.
             self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
         assert block_size == self.block_size
@@ -497,12 +523,10 @@ class NixlConnectorWorker:
         # hybrid attn, etc
         # block size in bytes
         self.block_len = kv_elem_size * math.prod(block_shape)
-
-        logger.debug("Registering KV_Caches. use_mla: %s, shape %s",
-                     self.use_mla, first_kv_cache.shape)
-        logger.debug("num_blocks: %s, block_shape: %s", self.num_blocks,
-                     block_shape)
-        logger.debug("Per layer kv cache size: %s", first_kv_cache.shape)
+        logger.info(
+            "Registering KV_Caches: use_mla: %s, num_blocks: %s, "
+            "block_shape: %s, per_layer_kv_cache_shape: %s", use_mla,
+            self.num_blocks, block_shape, first_kv_cache.shape)
         self.dst_num_blocks[self.engine_id] = self.num_blocks
         self.kv_caches = kv_caches
         kv_caches_base_addr = []
@@ -514,9 +538,12 @@ class NixlConnectorWorker:
         # are non-contiguous (it's not locally guaranteed that they will be)
         # Disadvantage is that the encoded NixlAgentMetadata is now larger
         # (roughly 8KB vs 5KB).
+        # Conversely for FlashInfer, K and V are transferred in the same tensor
+        # to better exploit the memory layout (ie num_blocks is the first dim).
         for cache_or_caches in kv_caches.values():
             # Normalize to always be a list of caches
-            cache_list = [cache_or_caches] if self.use_mla else cache_or_caches
+            cache_list = [cache_or_caches] if use_mla or self._use_flashinfer \
+                else cache_or_caches
             for cache in cache_list:
                 base_addr = cache.data_ptr()
                 region_len = self.num_blocks * self.block_len
@@ -581,7 +608,8 @@ class NixlConnectorWorker:
             kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
             num_blocks=self.num_blocks,
             tp_size=self.world_size,
-            block_len=self.block_len)
+            block_len=self.block_len,
+            attn_backend_name=self.backend_name)
         ready_event = threading.Event()
         self._nixl_handshake_listener_t = threading.Thread(
             target=self._nixl_handshake_listener,
@@ -641,6 +669,10 @@ class NixlConnectorWorker:
             assert self._tp_size[engine_id] == nixl_agent_meta.tp_size
         else:
             self._tp_size[engine_id] = nixl_agent_meta.tp_size
+        # We may eventually enable this after asserting equality in cache
+        # layout and close outputs.
+        assert nixl_agent_meta.attn_backend_name == self.backend_name
+
         self._remote_agents[engine_id][
             remote_tp_rank] = self.nixl_wrapper.add_remote_agent(
                 nixl_agent_meta.agent_metadata)
@@ -659,13 +691,16 @@ class NixlConnectorWorker:
         else:
             remote_block_size = nixl_agent_meta.block_len // (
                 self.slot_size_bytes * tp_ratio)
+            if self._use_flashinfer:
+                # Account for joint KV in FlashInfer.
+                remote_block_size //= 2
 
             assert nixl_agent_meta.block_len == self.block_len * tp_ratio, (
                 "Remote P worker KV layer cache must be of shape [2, N, "
                 "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
             )
 
-        assert self.block_size == remote_block_size, "Remote P worker with "
+        assert self.block_size == remote_block_size, "Remote P worker with " \
         "different block size is not supported"
 
         assert self.num_blocks >= nixl_agent_meta.num_blocks
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 7fef697d8..9a97b21d8 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -47,6 +47,7 @@ class _Backend(enum.Enum):
     ROCM_AITER_MLA_VLLM_V1 = enum.auto()
     TORCH_SDPA = enum.auto()
     FLASHINFER = enum.auto()
+    FLASHINFER_VLLM_V1 = enum.auto()
     TRITON_MLA = enum.auto()  # Supported by V1
     TRITON_MLA_VLLM_V1 = enum.auto()
     FLASHMLA_VLLM_V1 = enum.auto()
-- 
GitLab


From aa49f148322a39727be110da51a6782d43a2f5d8 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Thu, 5 Jun 2025 14:21:53 -0400
Subject: [PATCH 196/274] [Quantization] Skip Fp4 Test for `compressed-tensors`
 (#19217)

---
 tests/quantization/test_compressed_tensors.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 807b24d4e..03480343d 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -651,6 +651,7 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output
 
 
+@pytest.mark.skip(reason="Skip until the model config is updated")
 def test_compressed_tensors_nvfp4a16(vllm_runner):
     # run weight only example
     model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
-- 
GitLab


From 87360308b7ee9b0e529ba9146c93a9ef5526b968 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Thu, 5 Jun 2025 15:40:39 -0400
Subject: [PATCH 197/274] [V1] Use FlashInfer by default on Blackwell GPUs
 (#19118)

---
 vllm/platforms/cuda.py      | 15 +++++++++++++++
 vllm/platforms/interface.py | 24 ++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index bde606f0c..8bbda94e5 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -229,6 +229,21 @@ class CudaPlatformBase(Platform):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
                         "triton_attn.TritonAttentionBackend")
+            if cls.is_device_capability(100):
+                # Prefer FlashInfer for V1 on Blackwell GPUs if installed
+                try:
+                    import flashinfer  # noqa: F401
+                    logger.info_once(
+                        "Using FlashInfer backend on V1 engine by default for "
+                        "Blackwell (SM 10.0) GPUs.")
+                    return ("vllm.v1.attention.backends."
+                            "flashinfer.FlashInferBackend")
+                except ImportError:
+                    logger.info_once(
+                        "FlashInfer failed to import for V1 engine on "
+                        "Blackwell (SM 10.0) GPUs; it is recommended to "
+                        "install FlashInfer for better performance.")
+                    pass
             if cls.has_device_capability(80):
                 logger.info_once("Using Flash Attention backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 9a97b21d8..08d7aa175 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -228,6 +228,30 @@ class Platform:
 
         return current_capability.to_int() >= capability
 
+    @classmethod
+    def is_device_capability(
+        cls,
+        capability: Union[tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        """
+        Test whether this platform has exactly the specified device capability.
+
+        The `capability` argument can either be:
+
+        - A tuple `(major, minor)`.
+        - An integer `<major><minor>`. (See
+        [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
+        """
+        current_capability = cls.get_device_capability(device_id=device_id)
+        if current_capability is None:
+            return False
+
+        if isinstance(capability, tuple):
+            return current_capability == capability
+
+        return current_capability.to_int() == capability
+
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
         """Get the name of a device."""
-- 
GitLab


From cb6d572e85a34aec7b4409833bff12af28b0d28b Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Thu, 5 Jun 2025 14:29:28 -0700
Subject: [PATCH 198/274] [Model] NemotronH support (#18863)

Signed-off-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
Co-authored-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
---
 docs/models/supported_models.md               |   1 +
 tests/models/registry.py                      |   2 +
 vllm/model_executor/models/nemotron_h.py      | 565 ++++++++++++++++++
 vllm/model_executor/models/registry.py        |   1 +
 vllm/transformers_utils/configs/__init__.py   |   2 +
 vllm/transformers_utils/configs/nemotron_h.py | 258 ++++++++
 6 files changed, 829 insertions(+)
 create mode 100644 vllm/model_executor/models/nemotron_h.py
 create mode 100644 vllm/transformers_utils/configs/nemotron_h.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 71414d2aa..a8a6f3417 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -346,6 +346,7 @@ Specified using `--task generate`.
 | `MixtralForCausalLM`                              | Mixtral-8x7B, Mixtral-8x7B-Instruct                 | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.                                                          | ✅︎                     | ✅︎                          |
 | `MPTForCausalLM`                                  | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter        | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.                                                                                                   |                        | ✅︎                          |
 | `NemotronForCausalLM`                             | Nemotron-3, Nemotron-4, Minitron                    | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.                                                                                                         | ✅︎                     | ✅︎                          |
+| `NemotronHForCausalLM`                            | Nemotron-H                                          | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc.                                                                       | ✅︎                     | ✅︎                          |
 | `OLMoForCausalLM`                                 | OLMo                                                | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.                                                                                                                             |                        | ✅︎                          |
 | `OLMo2ForCausalLM`                                | OLMo2                                               | `allenai/OLMo-2-0425-1B`, etc.                                                                                                                                               |                        | ✅︎                          |
 | `OLMoEForCausalLM`                                | OLMoE                                               | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.                                                                                                        |                        | ✅︎                          |
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 3e07dc0f3..e6543c197 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -212,6 +212,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
+    "NemotronHForCausalLM": _HfExamplesInfo("nvidia/Nemotron-H-8B-Base-8K",
+                                            trust_remote_code=True),
     "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
     "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"),
     "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
new file mode 100644
index 000000000..2ef8d3115
--- /dev/null
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -0,0 +1,565 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from https://github.com/vllm-project/vllm/blob/94d8ec8d2bcb4ec55e33022b313c7e978edf05e1/vllm/model_executor/models/bamba.py
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only NemotronH model."""
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+from torch import nn
+
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.mamba2_metadata import (
+    Mamba2Metadata, prepare_mamba2_metadata)
+from vllm.model_executor.layers.mamba.mamba_mixer2 import (
+    MambaMixer2, extra_groups_for_head_shards)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
+                                                   SupportsLoRA, SupportsPP,
+                                                   SupportsQuant,
+                                                   SupportsV0Only)
+from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
+                                                    MambaCacheParams)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader, make_empty_intermediate_tensors_factory, make_layers,
+    maybe_prefix)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs import NemotronHConfig
+from vllm.utils import LayerBlockType
+
+
+class NemotronHMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        self.up_proj = MergedColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_sizes=[config.intermediate_size],
+            bias=bias,
+            quant_config=quant_config,
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+        )
+        self.act_fn = ReLUSquaredActivation()
+
+    def forward(self, x: torch.Tensor):
+        x, _ = self.up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class NemotronHMLPDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.mixer = NemotronHMLP(config,
+                                  quant_config=quant_config,
+                                  bias=config.mlp_bias)
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer(hidden_states)
+        return hidden_states, residual
+
+
+class NemotronHMambaDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.mixer = MambaMixer2(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.ssm_state_size,
+            conv_kernel_size=config.conv_kernel,
+            intermediate_size=config.expand * config.hidden_size,
+            use_conv_bias=config.use_conv_bias,
+            use_bias=config.use_bias,
+            n_groups=config.n_groups,
+            num_heads=config.mamba_num_heads,
+            head_dim=config.mamba_head_dim,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.mamba_hidden_act,
+            quant_config=quant_config,
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        mamba_cache_params: MambaCacheParams,
+        mamba2_metadata: Mamba2Metadata,
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer(hidden_states, mamba_cache_params,
+                                   mamba2_metadata)
+        return hidden_states, residual
+
+
+class NemotronHAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class NemotronHAttentionDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.mixer = NemotronHAttention(
+            config,
+            layer_idx,
+            cache_config,
+            quant_config,
+            prefix,
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer(hidden_states=hidden_states)
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "M": NemotronHMambaDecoderLayer,
+    "-": NemotronHMLPDecoderLayer,
+    "*": NemotronHAttentionDecoderLayer,
+}
+
+
+class NemotronHModel(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config: NemotronHConfig = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        lora_vocab = ((lora_config.lora_extra_vocab_size *
+                       (lora_config.max_loras or 1)) if lora_config else 0)
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+
+        def get_layer(prefix: str):
+            layer_idx = int(prefix.rsplit(".", 1)[1])
+            layer_class = ALL_DECODER_LAYER_TYPES[
+                config.hybrid_override_pattern[layer_idx]]
+            return layer_class(
+                config,
+                layer_idx,
+                cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            len(config.hybrid_override_pattern),
+            get_layer,
+            prefix=f"{prefix}.layers")
+        self.make_empty_intmd_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size)
+
+        self.norm_f = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        mamba_cache_params: MambaCacheParams,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        attn_metadata = get_forward_context().attn_metadata
+
+        mamba2_metadata = prepare_mamba2_metadata(
+            chunk_size=self.config.chunk_size,
+            attn_metadata=attn_metadata,
+        )
+
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        residual = None
+        num_non_mamba_layers = 0
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            layer_mamba_cache_params = None
+            if isinstance(layer, NemotronHMambaDecoderLayer):
+                layer_mamba_cache_params = mamba_cache_params.at_layer_idx(
+                    i - num_non_mamba_layers)
+            else:
+                num_non_mamba_layers += 1
+
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+                mamba_cache_params=layer_mamba_cache_params,
+                mamba2_metadata=mamba2_metadata,
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+        hidden_states, _ = self.norm_f(hidden_states, residual)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        attb_params_mapping = {
+            "q_proj": "q",
+            "k_proj": "k",
+            "v_proj": "v",
+        }
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "embeddings" in name:
+                name = name.replace("embeddings", "embed_tokens")
+
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+                loaded_weight = loaded_weight.to(torch.float32)
+
+            if "D" in name:
+                loaded_weight = loaded_weight.to(torch.float32)
+
+            if "dt_bias" in name:
+                loaded_weight = loaded_weight.to(torch.float32)
+
+            # load attn params
+            if any(proj in name for proj in ["q_proj", "k_proj", "v_proj"]):
+                weight_name = next(proj
+                                   for proj in ["q_proj", "k_proj", "v_proj"]
+                                   if proj in name)
+                name = name.replace(weight_name, "qkv_proj")
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight,
+                              attb_params_mapping[weight_name])
+            # load other params
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+            loaded_params.add(name)
+        return loaded_params
+
+
+class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
+                           IsHybrid, SupportsV0Only, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": ["up_proj", "down_proj"]
+    }
+
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        lora_config = vllm_config.lora_config
+        scheduler_config = vllm_config.scheduler_config
+        assert not cache_config.enable_prefix_caching, \
+            "NemotronH currently does not support prefix caching"
+
+        self.quant_config = vllm_config.quant_config
+
+        super().__init__()
+        self.config = config
+        self.scheduler_config = scheduler_config
+        self.model = NemotronHModel(vllm_config=vllm_config,
+                                    prefix=maybe_prefix(prefix, "model"))
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        # Used to track and store by the Mamba cache between steps.
+        self.mamba_cache: Optional[MambaCacheManager] = None
+
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+
+        self.make_empty_intmd_tensors = (self.model.make_empty_intmd_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
+                inputs_embeds: Optional[torch.Tensor] = None,
+                **kwargs):
+        if self.mamba_cache is None:
+
+            num_mamba_layers = self.model_config.get_num_layers_by_block_type(
+                self.vllm_config.parallel_config, LayerBlockType.mamba)
+
+            self.mamba_cache = MambaCacheManager(
+                self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers,
+                *self._get_mamba_cache_shape())
+        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
+        hidden_states = self.model(input_ids, positions, mamba_cache_params,
+                                   intermediate_tensors, inputs_embeds)
+
+        return hidden_states
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(
+            input_buffers, **kwargs)
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
+
+    def _get_mamba_cache_shape(
+            self) -> tuple[tuple[int, int], tuple[int, int]]:
+        world_size = get_tensor_model_parallel_world_size()
+        hidden_size = self.config.hidden_size
+
+        conv_state_shape, temporal_state_shape = None, None
+
+        intermediate_size = self.config.expand * hidden_size
+
+        # if n_groups is not divisible by world_size, need to extend the shards
+        # to ensure all groups needed by a head is sharded along with it
+        n_groups = (
+            self.config.n_groups +
+            extra_groups_for_head_shards(self.config.n_groups, world_size))
+
+        # - heads and n_groups are TP-ed
+        conv_dim = (intermediate_size +
+                    2 * n_groups * self.config.ssm_state_size)
+        conv_state_shape = (
+            divide(conv_dim, world_size),
+            self.config.conv_kernel - 1,
+        )
+
+        # These are not TP-ed as they depend on A, dt_bias, D
+        # - they are typically small
+        #   e.g., (h_heads, d_head, d_state) = (128, 64, 128)
+        temporal_state_shape = (
+            divide(self.config.mamba_num_heads, world_size),
+            self.config.mamba_head_dim,
+            self.config.ssm_state_size,
+        )
+        return conv_state_shape, temporal_state_shape
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        # update name in weights before passing to loader
+        updated_weights = []
+        for name, loaded_weight in weights:
+            name = name.replace("backbone", "model")
+            updated_weights.append((name, loaded_weight))
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(updated_weights)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 57d1b7c53..e82e36638 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -92,6 +92,7 @@ _TEXT_GENERATION_MODELS = {
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "MiMoForCausalLM": ("mimo", "MiMoForCausalLM"),
     "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
+    "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"),
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
     "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"),
     "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"),
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 7edff455f..97a1b683a 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
+from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
 from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
@@ -50,6 +51,7 @@ __all__ = [
     "MoonViTConfig",
     "KimiVLConfig",
     "NemotronConfig",
+    "NemotronHConfig",
     "NVLM_D_Config",
     "OvisConfig",
     "SkyworkR1VChatConfig",
diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py
new file mode 100644
index 000000000..9fe75f2df
--- /dev/null
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -0,0 +1,258 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NemotronH model configuration"""
+
+import regex as re
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronHConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
+    to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to
+    that of the NemotronH-v0.1 model.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 131072):
+            Vocabulary size of the NemotronH model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`NemotronHModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be
+            tied. Note that this is only relevant if the model has a output
+            word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 21504):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 52):
+            Number of hidden layers in the Transformer encoder.
+        hybrid_override_pattern (`str`, *optional*, defaults to
+            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+            The pattern of the hybrid model. The pattern is a string of
+            characters where each character represents
+            M: Mamba2, *: Attention, -: MLP
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        attention_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA) otherwise GQA is used.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False`
+            residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`,
+            all logits will be calculated. If an integer value, only last
+            `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*, defaults to None):
+            Sliding window attention window size.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels.
+            These are available only if `mamba-ssm` and `causal-conv1d`
+            are installed, and the mamba modules are running on a CUDA device.
+        ssm_state_size (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents.
+        mamba_num_heads (`int`, *optional*, defaults to 128):
+            Number of heads in Mamba layers.
+        mamba_n_groups (`int`, *optional*, defaults to 8):
+            Number of groups in Mamba layers.
+        mamba_head_dim (`int`, *optional*, defaults to 64):
+            Dimension of each Mamba head.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor used to determine the mamba intermediate size.
+        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+            The non-linear activation function in the Mamba layers.
+        mamba_dt_min (`float`, *optional*, defaults to 0.001):
+            Minimum value for the time step in Mamba.
+        mamba_dt_max (`float`, *optional*, defaults to 0.1):
+            Maximum value for the time step in Mamba.
+        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+            Limits for the time step in Mamba.
+        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Floor value for time step initialization in Mamba.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the convolution layer of the mamba mixer
+            block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the input and output projections of the
+            mamba mixer block.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            Size of chunks for Mamba processing.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pre-normalization residual connections.
+    """
+
+    model_type = "nemotron_h"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=21504,
+        num_hidden_layers=52,
+        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+        num_attention_heads=32,
+        attention_head_dim=128,
+        num_key_value_heads=8,  # nemo: num_query_groups
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.02,  # nemo: init_method_std
+        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
+        residual_in_fp32=False,  #  Megatron Core default value
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,  # * ADDED
+        use_mamba_kernels=True,
+        ssm_state_size=128,  # mamba_state_size
+        mamba_num_heads=128,
+        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
+        mamba_head_dim=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_hidden_act="silu",
+        mamba_dt_min=0.001,
+        mamba_dt_max=0.1,
+        mamba_dt_limit=(0.0, float("inf")),
+        mamba_dt_init_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_chunk_size=256,
+        rescale_prenorm_residual=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hybrid_override_pattern = hybrid_override_pattern
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+
+        # Validate hybrid_override_pattern
+        # M: Mamba2, *: Attention, -: MLP
+        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
+            "hybrid_override_pattern must have same length as "
+            "num_hidden_layers")
+        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
+            "hybrid_override_pattern must only contain characters "
+            "'M', '*', or '-'")
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.conv_kernel = mamba_d_conv
+        self.expand = mamba_expand
+        self.mamba_hidden_act = mamba_hidden_act
+        self.time_step_min = mamba_dt_min
+        self.time_step_max = mamba_dt_max
+        self.time_step_limit = mamba_dt_limit
+        self.time_step_floor = mamba_dt_init_floor
+        self.use_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.chunk_size = mamba_chunk_size
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        return [
+            "mamba" if self.hybrid_override_pattern[i] == "M" else
+            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
+            for i in range(self.num_hidden_layers)
+        ]
-- 
GitLab


From c8134bea15826876e37694834ad87d9c4bdfb26b Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Thu, 5 Jun 2025 21:51:32 -0400
Subject: [PATCH 199/274] Fix AOPerModuleConfig name changes (#18869)

Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  3 +++
 tests/quantization/test_torchao.py            |  6 +++---
 .../layers/quantization/torchao.py            | 21 +++++++++++++++++--
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4ee6b499b..b739851cb 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -424,6 +424,9 @@ steps:
   - vllm/model_executor/layers/quantization
   - tests/quantization
   commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release
+  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index c966dc9b8..54ec59585 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -13,7 +13,7 @@ TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_pre_quantized_model(vllm_runner):
-    with vllm_runner("drisspg/float8_dynamic_act_float8_weight-opt-125m",
+    with vllm_runner("drisspg/fp8-opt-125m",
                      quantization="torchao",
                      dtype="bfloat16",
                      enforce_eager=True) as llm:
@@ -30,10 +30,10 @@ def test_pre_quantized_model(vllm_runner):
         "cuda:0",
         # {"": "cuda"},
     ])
-def test_opt_125m_int4wo_model_loading_with_params(vllm_runner,
+def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
                                                    pt_load_map_location):
     torch._dynamo.reset()
-    model_name = "jerryzh168/opt-125m-int4wo"
+    model_name = "jerryzh168/opt-125m-int8wo-partial-quant"
     with vllm_runner(model_name=model_name,
                      quantization="torchao",
                      dtype="bfloat16",
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index af362f7a7..a7d933203 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -6,6 +6,7 @@ import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
+from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -13,12 +14,28 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 
+logger = init_logger(__name__)
+
 
 class TorchAOConfig(QuantizationConfig):
     """Config class for torchao."""
 
     def __init__(self, torchao_config) -> None:
         self.torchao_config = torchao_config
+        """
+        # TorchAO quantization relies on tensor subclasses. In order,
+        # to enable proper caching this needs standalone compile
+        if is_torch_equal_or_newer("2.8.0"):
+            os.environ["VLLM_TEST_STANDALONE_COMPILE"] = "1"
+            logger.info(
+                "Using TorchAO: Setting VLLM_TEST_STANDALONE_COMPILE=1")
+
+        # TODO: remove after the torch dependency is updated to 2.8
+        if is_torch_equal_or_newer(
+                "2.7.0") and not is_torch_equal_or_newer("2.8.0"):
+            os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"
+            logger.info("Using TorchAO: Setting VLLM_DISABLE_COMPILE_CACHE=1")
+        """
 
     def __repr__(self) -> str:
         return f"TorchAOConfig({self.torchao_config})"
@@ -61,10 +78,10 @@ class TorchAOConfig(QuantizationConfig):
         if not isinstance(layer, LinearBase):
             return None
 
-        from torchao.quantization import AOPerModuleConfig
+        from torchao.quantization import ModuleFqnToConfig
 
         module_fqn = prefix
-        if isinstance(self.torchao_config, AOPerModuleConfig):
+        if isinstance(self.torchao_config, ModuleFqnToConfig):
             module_fqn_to_config = self.torchao_config.module_fqn_to_config
             c = module_fqn_to_config.get(
                 module_fqn) or module_fqn_to_config.get("_default", None)
-- 
GitLab


From 3465b87ef8b36d91d8746f29076270d283a5251a Mon Sep 17 00:00:00 2001
From: Benjamin Chislett <benjamin.chislett@centml.ai>
Date: Thu, 5 Jun 2025 22:10:08 -0400
Subject: [PATCH 200/274] [Bugfix] Fix EAGLE vocab embedding construction for
 Llama 70B (#19033)

Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
---
 benchmarks/kernels/bench_fp8_gemm.py       |  2 +-
 tests/v1/spec_decode/test_eagle.py         | 64 ++++++++++++++--------
 vllm/model_executor/models/llama_eagle.py  | 14 ++---
 vllm/model_executor/models/llama_eagle3.py | 24 ++++----
 vllm/platforms/cuda.py                     |  1 +
 vllm/utils.py                              |  1 +
 vllm/v1/spec_decode/eagle.py               | 11 ++--
 7 files changed, 70 insertions(+), 47 deletions(-)

diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py
index 640a33419..b964ed242 100644
--- a/benchmarks/kernels/bench_fp8_gemm.py
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@@ -5,11 +5,11 @@ import copy
 import itertools
 
 import torch
-import triton
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
 from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
+from vllm.triton_utils import triton
 
 
 @triton.testing.perf_report(
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index eff8eff43..c93b7f57c 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -9,6 +9,7 @@ import torch
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VllmConfig)
+from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.v1.spec_decode.eagle import EagleProposer
 
 model_dir = "meta-llama/Llama-3.1-8B-Instruct"
@@ -113,21 +114,26 @@ def test_prepare_inputs():
     assert torch.equal(token_indices, expected_token_indices)
 
 
-@pytest.mark.parametrize(
-    "method,proposer_helper,draft_model_dir,target_attribute_path", [
-        ("eagle", lambda k: _create_proposer("eagle", k), eagle_dir,
-         ('lm_head', )),
-        ("eagle3", lambda k: _create_proposer("eagle3", k), eagle3_dir,
-         ('model', 'embed_tokens')),
-    ])
+@pytest.mark.parametrize("method,proposer_helper", [
+    ("eagle", lambda k: _create_proposer("eagle", k)),
+    ("eagle3", lambda k: _create_proposer("eagle3", k)),
+])
+@pytest.mark.parametrize("pp_size", [1, 2])
+@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
 @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
 @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
 @mock.patch('vllm.v1.spec_decode.eagle.get_model')
 def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
-                    proposer_helper, draft_model_dir, target_attribute_path):
-
-    # Setup model mock
+                    proposer_helper, pp_size, use_distinct_embed_tokens):
+    # Setup draft model mock
     mock_model = mock.MagicMock()
+    if use_distinct_embed_tokens:
+        # Some models can have a different hidden size than the target model,
+        # so we test that their embed_tokens doesn't get overwritten
+        mock_model.model.embed_tokens.weight.shape = (131072, 2048)
+    else:
+        mock_model.model.embed_tokens.weight.shape = (131072, 4096)
+
     mock_get_model.return_value = mock_model
 
     # Setup mocks for attention layers
@@ -145,22 +151,24 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
 
     # Setup mock for pp group to return the appropriate value for world size
     mock_pp_group = mock.MagicMock()
-    mock_pp_group.world_size = 2 if method == "eagle" else 1
+    mock_pp_group.world_size = pp_size
     mock_get_pp_group.return_value = mock_pp_group
 
-    # Setup target model with the appropriate attributes
-    target_model = mock.MagicMock()
+    # Setup the target model mock with a custom class so that
+    # isinstance() checks match the expected type.
+    class _TargetModelStub(LlamaForCausalLM):
+        model: mock.MagicMock
+        lm_head: mock.MagicMock
 
-    # Create the necessary attributes on the target model
-    current_obj = target_model
-    for i, attr in enumerate(target_attribute_path):
-        if i == len(target_attribute_path) - 1:
-            # Set the last attribute in the path to a MagicMock
-            setattr(current_obj, attr, mock.MagicMock())
-        else:
-            # Create intermediate objects if needed
-            setattr(current_obj, attr, mock.MagicMock())
-            current_obj = getattr(current_obj, attr)
+    target_model = mock.create_autospec(_TargetModelStub, instance=True)
+    target_model.model = mock.MagicMock()
+    target_model.model.embed_tokens.weight.shape = (131072, 4096)
+
+    from vllm.model_executor.models import SupportsMultiModal
+    assert not isinstance(target_model, SupportsMultiModal)
+
+    if method == "eagle":
+        target_model.lm_head = mock.MagicMock()
 
     # Create proposer using the helper function
     proposer = proposer_helper(k=8)
@@ -171,10 +179,18 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
     # Verify common interactions
     mock_get_model.assert_called_once()
 
-    # Verify the specific attribute sharing based on the method
+    # Verify that EAGLE models gain the lm head from the target model
     if method == "eagle":
         assert proposer.model.lm_head == target_model.lm_head
+
+    # Verify that the embed tokens are set correctly
+    # If pp_size is > 1, the embed tokens should be distinct
+    if pp_size > 1 or use_distinct_embed_tokens:
+        assert proposer.model.model.embed_tokens != \
+            target_model.model.embed_tokens
     else:
+        # When pp_size is 1 and the draft and target models have
+        # embed_tokens of the same shape, they should be shared.
         assert proposer.model.model.embed_tokens == \
             target_model.model.embed_tokens
 
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index f73b863fe..c7690604c 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -55,13 +55,11 @@ class LlamaModel(nn.Module):
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )
 
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -164,4 +162,4 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
             if "lm_head" not in name:
                 name = "model." + name
             model_weights[name] = loaded_weight
-        return loader.load_weights(model_weights.items())
+        loader.load_weights(model_weights.items())
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index d31a321b8..7fc9fe2eb 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -10,7 +10,6 @@ from transformers import LlamaConfig
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
-from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear
@@ -95,13 +94,11 @@ class LlamaModel(nn.Module):
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )
 
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -240,6 +237,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         model_weights = {}
         includes_draft_id_mapping = False
+        includes_embed_tokens = False
         for name, loaded_weight in weights:
             if "t2d" in name:
                 continue
@@ -248,12 +246,18 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
                 includes_draft_id_mapping = True
             elif "lm_head" not in name:
                 name = "model." + name
+            if "embed_tokens" in name:
+                includes_embed_tokens = True
             model_weights[name] = loaded_weight
 
+        skip_substrs = []
+        if not includes_draft_id_mapping:
+            skip_substrs.append("draft_id_to_target_id")
+        if not includes_embed_tokens:
+            skip_substrs.append("embed_tokens")
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=None,
-            skip_substrs=["draft_id_to_target_id"] \
-                if not includes_draft_id_mapping else None,
+            skip_substrs=skip_substrs,
         )
         loader.load_weights(model_weights.items())
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 8bbda94e5..8ad66776c 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -173,6 +173,7 @@ class CudaPlatformBase(Platform):
     def get_current_memory_usage(cls,
                                  device: Optional[torch.types.Device] = None
                                  ) -> float:
+        torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats(device)
         return torch.cuda.max_memory_allocated(device)
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 4f905e505..c19c0221c 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -899,6 +899,7 @@ class DeviceMemoryProfiler:
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
         from vllm.platforms import current_platform
+        gc.collect()
         return current_platform.get_current_memory_usage(self.device)
 
     def __enter__(self):
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 416bc8af1..4b5c9b7ec 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -330,16 +330,19 @@ class EagleProposer:
         self.attn_layer_names = list(draft_attn_layer_names)
 
         # share embed_tokens with the target model if needed
-        if get_pp_group().world_size == 1:
+        if get_pp_group().world_size == 1 \
+            and self.model.model.embed_tokens.weight.shape \
+                == target_model.model.embed_tokens.weight.shape:
             logger.info(
-                "The EAGLE head shares the same vocab embedding" \
+                "Assuming the EAGLE head shares the same vocab embedding" \
                 " with the target model."
             )
+            del self.model.model.embed_tokens
             self.model.model.embed_tokens = target_model.model.embed_tokens
         else:
             logger.info(
-                "Since PP > 1, the EAGLE head loaded its own vocab embedding" \
-                " weights instead of sharing them with the target model."
+                "The EAGLE head's vocab embedding will be loaded separately" \
+                " from the target model."
             )
 
         # share lm_head with the target model if needed
-- 
GitLab


From f8a1a2d108d290791ae1245b2ee309f38fdd7619 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Fri, 6 Jun 2025 11:47:09 +0800
Subject: [PATCH 201/274] [v1] Hybrid Memory Allocator (#17996)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/v1/core/test_kv_cache_utils.py          | 245 ++++++++--
 tests/v1/core/test_prefix_caching.py          | 450 ++++++++++++++----
 tests/v1/core/test_scheduler.py               |  16 +-
 tests/v1/core/test_specialized_manager.py     |  27 +-
 .../v1/e2e/test_correctness_sliding_window.py |   4 +-
 .../kv_connector/unit/test_nixl_connector.py  |   4 +-
 .../unit/test_remote_decode_lifecycle.py      |   4 +-
 .../unit/test_remote_prefill_lifecycle.py     |  24 +-
 tests/v1/kv_connector/unit/utils.py           |  10 +-
 tests/v1/worker/test_gpu_model_runner.py      |  27 +-
 vllm/config.py                                |  21 +
 vllm/engine/arg_utils.py                      |   8 +
 vllm/v1/core/block_pool.py                    |  69 ++-
 vllm/v1/core/kv_cache_coordinator.py          | 358 ++++++++++++++
 vllm/v1/core/kv_cache_manager.py              | 135 +++---
 vllm/v1/core/kv_cache_utils.py                | 322 +++++++++++--
 vllm/v1/core/sched/scheduler.py               |   7 +-
 vllm/v1/core/single_type_kv_cache_manager.py  | 151 +++---
 vllm/v1/kv_cache_interface.py                 |  33 +-
 vllm/v1/worker/gpu_model_runner.py            | 120 +++--
 vllm/v1/worker/tpu_model_runner.py            |  14 +-
 21 files changed, 1607 insertions(+), 442 deletions(-)
 create mode 100644 vllm/v1/core/kv_cache_coordinator.py

diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 71ea43383..ab7aa0282 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -15,8 +15,8 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.core.kv_cache_utils import (
     FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
     estimate_max_model_len, generate_block_hash_extra_keys,
-    get_max_concurrency_for_kv_cache_config, hash_block_tokens,
-    hash_request_tokens, unify_kv_cache_configs)
+    get_kv_cache_config, get_max_concurrency_for_kv_cache_config,
+    hash_block_tokens, hash_request_tokens, unify_kv_cache_configs)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheTensor,
                                         SlidingWindowSpec)
@@ -63,6 +63,20 @@ def new_kv_cache_spec(block_size=16,
                              sliding_window=sliding_window)
 
 
+def new_sliding_window_spec(block_size=16,
+                            num_kv_heads=2,
+                            head_size=64,
+                            dtype=torch.float32,
+                            use_mla=False,
+                            sliding_window=1):
+    return SlidingWindowSpec(block_size=block_size,
+                             num_kv_heads=num_kv_heads,
+                             head_size=head_size,
+                             dtype=dtype,
+                             use_mla=use_mla,
+                             sliding_window=sliding_window)
+
+
 def test_none_hash(monkeypatch):
     import vllm.v1.core.kv_cache_utils
 
@@ -403,10 +417,10 @@ def test_unify_kv_cache_configs():
     same_kv_cache_config = [
         KVCacheConfig(
             num_blocks=10,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -415,10 +429,10 @@ def test_unify_kv_cache_configs():
         ),
         KVCacheConfig(
             num_blocks=20,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -433,10 +447,10 @@ def test_unify_kv_cache_configs():
     need_sort_kv_cache_config = [
         KVCacheConfig(
             num_blocks=10,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -445,10 +459,10 @@ def test_unify_kv_cache_configs():
         ),
         KVCacheConfig(
             num_blocks=20,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer2"],
                                  new_kv_cache_spec(num_kv_heads=4)),
@@ -464,10 +478,10 @@ def test_unify_kv_cache_configs():
     diff_kv_cache_config = [
         KVCacheConfig(
             num_blocks=10,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -476,10 +490,10 @@ def test_unify_kv_cache_configs():
         ),
         KVCacheConfig(
             num_blocks=20,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -636,7 +650,7 @@ def test_get_max_concurrency_for_kv_cache_config():
 
     kv_cache_config_full_attention = KVCacheConfig(
         num_blocks=int(1024 * 1.5),
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
                              full_attention_spec),
@@ -648,7 +662,7 @@ def test_get_max_concurrency_for_kv_cache_config():
 
     kv_cache_config_sliding_window = KVCacheConfig(
         num_blocks=129 * 3,
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
                              sliding_window_spec),
@@ -660,7 +674,7 @@ def test_get_max_concurrency_for_kv_cache_config():
 
     kv_cache_config_hybrid_model = KVCacheConfig(
         num_blocks=(1024 + 129) * 3,
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
                              full_attention_spec),
@@ -678,9 +692,9 @@ def test_allocate_with_lookahead():
     block_size = 4
     config = KVCacheConfig(
         num_blocks=10,
-        tensors={
-            "layer1": KVCacheTensor(100),
-        },
+        kv_cache_tensors=[
+            KVCacheTensor(size=100, shared_by=["layer1"]),
+        ],
         kv_cache_groups=[
             KVCacheGroupSpec(["layer1"],
                              new_kv_cache_spec(block_size=block_size)),
@@ -702,7 +716,7 @@ def test_allocate_with_lookahead():
         num_new_tokens=3,
         num_lookahead_tokens=2,  # Total required: 3+2=5 tokens
     )
-    assert len(blocks.blocks) == 2  # ceil(5/4)=2 blocks
+    assert len(blocks.get_block_ids()[0]) == 2  # ceil(5/4)=2 blocks
 
     # Test case 2: With precomputed blocks
     kv_cache_manager = KVCacheManager(kv_cache_config=config,
@@ -713,7 +727,7 @@ def test_allocate_with_lookahead():
         num_new_tokens=3,
         num_lookahead_tokens=2,
     )
-    assert len(blocks.blocks) == 2
+    assert len(blocks.get_block_ids()[0]) == 2
 
     # Test case 3: With precomputed blocks
     # required_blocks = ceil((3 + 4) / 4) = 2
@@ -724,4 +738,165 @@ def test_allocate_with_lookahead():
         num_new_tokens=3,
         num_lookahead_tokens=4,
     )
-    assert len(blocks.blocks) == 2
+    assert len(blocks.get_block_ids()[0]) == 2
+
+
+def test_get_kv_cache_config():
+    # pass max_model_len to pass check_enough_kv_cache_memory
+    model_config = ModelConfig(max_model_len=16)
+    vllm_config = VllmConfig(model_config=model_config)
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    # all layers are full attention -> single group
+    kv_cache_specs_full = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_kv_cache_spec(),
+    }
+    kv_cache_config_full = get_kv_cache_config(
+        vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_full == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
+        ])
+
+    # all layers are sliding window -> single group
+    kv_cache_specs_sliding = {
+        'layer_1': new_sliding_window_spec(),
+        'layer_2': new_sliding_window_spec(),
+    }
+    kv_cache_config_sliding = get_kv_cache_config(
+        vllm_config, kv_cache_specs_sliding, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_sliding == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"], new_sliding_window_spec())
+        ])
+
+    # full + sliding, but disable_hybrid_kv_cache_manager
+    vllm_config.scheduler_config.disable_hybrid_kv_cache_manager = True
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"],
+                             new_kv_cache_spec(sliding_window=1)),
+        ],
+    )
+    vllm_config.scheduler_config.disable_hybrid_kv_cache_manager = False
+
+    # full + sliding, with hybrid_kv_cache_manager
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=64,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 64,
+                          shared_by=["layer_1", "layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1"], new_kv_cache_spec()),
+            KVCacheGroupSpec(["layer_2"], new_sliding_window_spec()),
+        ],
+    )
+
+    # 2 full + 4 sliding, 2 layers per group
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_kv_cache_spec(),
+        'layer_3': new_sliding_window_spec(),
+        'layer_4': new_sliding_window_spec(),
+        'layer_5': new_sliding_window_spec(),
+        'layer_6': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1", "layer_3", "layer_5"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2", "layer_4", "layer_6"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()),
+            KVCacheGroupSpec(["layer_3", "layer_4"],
+                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_5", "layer_6"],
+                             new_sliding_window_spec()),
+        ],
+    )
+
+    # 3 full + 7 sliding, pad to 3 full + 9 sliding
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_kv_cache_spec(),
+        'layer_3': new_kv_cache_spec(),
+        'layer_4': new_sliding_window_spec(),
+        'layer_5': new_sliding_window_spec(),
+        'layer_6': new_sliding_window_spec(),
+        'layer_7': new_sliding_window_spec(),
+        'layer_8': new_sliding_window_spec(),
+        'layer_9': new_sliding_window_spec(),
+        'layer_10': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 3 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(
+                size=mem_per_block_per_layer * 32,
+                shared_by=["layer_1", "layer_4", "layer_7", "layer_10"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2", "layer_5", "layer_8"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_3", "layer_6", "layer_9"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2", "layer_3"],
+                             new_kv_cache_spec()),
+            KVCacheGroupSpec(["layer_4", "layer_5", "layer_6"],
+                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_7", "layer_8", "layer_9"],
+                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_10"], new_sliding_window_spec()),
+        ],
+    )
+
+    # different hidden size, unimplemented
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(head_size=128),
+        'layer_2': new_kv_cache_spec(),
+    }
+    with pytest.raises(NotImplementedError):
+        get_kv_cache_config(vllm_config, kv_cache_specs_hybrid,
+                            mem_per_block_per_layer * 2 * 32)
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 897d181ec..bf4cb539e 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching."""
 
+import copy
 from typing import Optional
 
 import pytest
@@ -13,8 +14,8 @@ from vllm.sampling_params import SamplingParams
 from vllm.utils import sha256
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
-from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
-                                         hash_block_tokens)
+from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
+                                         KVCacheBlock, hash_block_tokens)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, SlidingWindowSpec)
 
@@ -47,7 +48,7 @@ def make_request(request_id,
 def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
     return KVCacheConfig(
         num_blocks=num_blocks,
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(
                 ["layer"],
@@ -57,6 +58,38 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
     )
 
 
+def make_kv_cache_config_hybrid_model(block_size: int,
+                                      num_blocks: int) -> KVCacheConfig:
+    return KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec(
+                ["layer1"],
+                FullAttentionSpec(block_size, 1, 1, torch.float32, False),
+            ),
+            KVCacheGroupSpec(
+                ["layer2"],
+                SlidingWindowSpec(block_size,
+                                  1,
+                                  1,
+                                  torch.float32,
+                                  False,
+                                  sliding_window=2 * block_size),
+            ),
+            KVCacheGroupSpec(
+                ["layer3"],
+                SlidingWindowSpec(block_size,
+                                  1,
+                                  1,
+                                  torch.float32,
+                                  False,
+                                  sliding_window=2 * block_size),
+            ),
+        ],
+    )
+
+
 @pytest.mark.parametrize("hash_algo", ["sha256", "hash"])
 def test_prefill(hash_algo):
     manager = KVCacheManager(
@@ -79,10 +112,10 @@ def test_prefill(hash_algo):
     req0 = make_request("0", all_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert len(manager.req_to_block_hashes[req0.request_id]) == 3
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[1, 2, 3, 4]]
 
@@ -92,7 +125,8 @@ def test_prefill(hash_algo):
         block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
         block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                        block_tokens)
-        assert manager.block_pool.blocks[block_id].block_hash == block_hash
+        assert manager.block_pool.blocks[
+            block_id].block_hash.block_hash == block_hash
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
         parent_block_hash = block_hash.hash_value
 
@@ -111,10 +145,10 @@ def test_prefill(hash_algo):
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[5]]
-    for block in computed_blocks.blocks:
+    for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
     # At this point, we should have 5 free blocks left.
@@ -145,7 +179,7 @@ def test_prefill(hash_algo):
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[6]]
 
@@ -165,10 +199,10 @@ def test_prefill(hash_algo):
     # Cache miss and eviction.
     req3 = make_request("3", [99] * (16 * 10))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req3, 16 * 10,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     # This block ID order also checks the eviction order.
     assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]]
@@ -177,6 +211,138 @@ def test_prefill(hash_algo):
     assert manager.block_pool.free_block_queue.free_list_tail is None
 
 
+def test_prefill_hybrid_model():
+    block_size = 16
+    manager = KVCacheManager(
+        make_kv_cache_config_hybrid_model(block_size, 21),
+        max_model_len=8192,
+        enable_caching=True,
+    )
+
+    hash_fn = hash
+
+    # Complete 3 blocks (48 tokens)
+    common_token_ids = [i for i in range(3) for _ in range(block_size)]
+
+    # Fully cache miss
+    # Incomplete 1 block (7 tokens)
+    unique_token_ids = [3] * 7
+    all_token_ids = common_token_ids + unique_token_ids
+    req0 = make_request("0", all_token_ids)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
+    assert len(manager.req_to_block_hashes[req0.request_id]) == 3
+    assert not computed_blocks.blocks[0]
+    assert num_computed_tokens == 0
+    blocks = manager.allocate_slots(req0, 55,
+                                    len(computed_blocks.blocks[0]) * 16,
+                                    computed_blocks)
+    assert blocks.get_block_ids() == [[1, 2, 3, 4], [5, 6, 7, 8],
+                                      [9, 10, 11, 12]]
+
+    # Check full block metadata
+    parent_block_hash = None
+    for length, block_ids in zip((1, 2, 3),
+                                 ((1, 5, 9), (2, 6, 10), (3, 7, 11))):
+        block_tokens = tuple(all_token_ids[(length - 1) * 16:length * 16])
+        block_hash = hash_block_tokens(hash_fn, parent_block_hash,
+                                       block_tokens)
+        for block_id in block_ids:
+            assert manager.block_pool.blocks[
+                block_id].block_hash.block_hash == block_hash
+            assert manager.block_pool.blocks[block_id].ref_cnt == 1
+        parent_block_hash = block_hash.hash_value
+
+    # Check partial block metadata
+    for block_id in (4, 8, 12):
+        assert manager.block_pool.blocks[block_id].block_hash is None
+        assert manager.block_pool.blocks[block_id].ref_cnt == 1
+
+    # Cache hit in the common prefix
+    # Incomplete 1 block (5 tokens)
+    unique_token_ids = [3] * 5
+    req1 = make_request("1", common_token_ids + unique_token_ids)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
+    assert len(manager.req_to_block_hashes[req1.request_id]) == 3
+    assert computed_blocks.get_block_ids() == [[1, 2, 3], [0, 6, 7],
+                                               [0, 10, 11]]
+    assert num_computed_tokens == 3 * 16
+    num_new_tokens = 53 - 3 * 16
+    blocks = manager.allocate_slots(req1, num_new_tokens,
+                                    len(computed_blocks.blocks[0]) * 16,
+                                    computed_blocks)
+    assert blocks.get_block_ids() == [[13], [14], [15]]
+    for block_per_group in computed_blocks.blocks:
+        for block in block_per_group:
+            if block != manager.block_pool.null_block:
+                assert block.ref_cnt == 2
+
+    block_hashes = manager.req_to_block_hashes[req1.request_id]
+    manager.free(req0)
+    manager.free(req1)
+
+    cached_block_hash_to_block_bak = copy.copy(
+        manager.block_pool.cached_block_hash_to_block)
+
+    def test_partial_request_hit(request_id: str,
+                                 hash_to_evict: list[BlockHashWithGroupId],
+                                 expect_hit_length: int):
+        req = make_request(request_id, common_token_ids + unique_token_ids)
+        for hash_with_group_id in hash_to_evict:
+            manager.block_pool.cached_block_hash_to_block.pop(
+                hash_with_group_id)
+        computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
+        assert len(manager.req_to_block_hashes[req.request_id]) == 3
+        assert num_computed_tokens == expect_hit_length * block_size
+        for block_per_group in computed_blocks.blocks:
+            assert len(block_per_group) == num_computed_tokens // block_size
+        for hash_with_group_id in hash_to_evict:
+            manager.block_pool.cached_block_hash_to_block[
+                hash_with_group_id] = cached_block_hash_to_block_bak[
+                    hash_with_group_id]
+        manager.free(req)
+
+    # Evict the blocks outside sliding window, does not affect the hit length.
+    test_partial_request_hit("2", [
+        BlockHashWithGroupId(block_hashes[0], 1),
+        BlockHashWithGroupId(block_hashes[0], 2)
+    ], 3)
+
+    # Evict the first block of full attention, makes total cache miss.
+    test_partial_request_hit("3", [
+        BlockHashWithGroupId(block_hashes[0], 0),
+    ], 0)
+
+    # Evict the last block of all layers, reduces the hit length to 2.
+    test_partial_request_hit("4", [
+        BlockHashWithGroupId(block_hashes[2], 0),
+        BlockHashWithGroupId(block_hashes[2], 1),
+        BlockHashWithGroupId(block_hashes[2], 2),
+    ], 2)
+
+    # Evict the last block of full attention, reduces the hit length to 2.
+    test_partial_request_hit("5", [BlockHashWithGroupId(block_hashes[2], 0)],
+                             2)
+
+    # Evict the last block of sliding window, reduces the hit length to 2.
+    test_partial_request_hit("6", [BlockHashWithGroupId(block_hashes[2], 1)],
+                             2)
+
+    # Evict the last block of sliding window, reduces the hit length to 2.
+    test_partial_request_hit("7", [BlockHashWithGroupId(block_hashes[2], 2)],
+                             2)
+
+    # Evict different set of blocks for full attention and sliding window makes
+    # total cache miss.
+    # The cache hit length of full attention is 1 * block_size.
+    # The cache hit length of sliding window is 2 * block_size.
+    # Then it is cache miss as the two type of layers have different hit length.
+    test_partial_request_hit("8", [
+        BlockHashWithGroupId(block_hashes[2], 0),
+        BlockHashWithGroupId(block_hashes[0], 1),
+        BlockHashWithGroupId(block_hashes[0], 2),
+    ], 0)
+
+
 def test_prefill_plp():
     '''Test prefill with APC and some prompt logprobs (plp) requests.
 
@@ -203,13 +369,13 @@ def test_prefill_plp():
     req0 = make_request("0", all_token_ids, prompt_logprobs=5)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert len(manager.req_to_block_hashes[req0.request_id]) == 0
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[1, 2, 3, 4]]
-    req0_block_hashes = [b.block_hash for b in blocks.blocks]
+    req0_block_hashes = [b.block_hash for b in blocks.blocks[0]]
 
     # Check full block metadata
     parent_block_hash = None
@@ -217,7 +383,8 @@ def test_prefill_plp():
         block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
         block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                        block_tokens)
-        assert manager.block_pool.blocks[block_id].block_hash == block_hash
+        assert manager.block_pool.blocks[
+            block_id].block_hash.block_hash == block_hash
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
         parent_block_hash = block_hash.hash_value
 
@@ -237,10 +404,10 @@ def test_prefill_plp():
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[5]]
-    for block in computed_blocks.blocks:
+    for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
     # At this point, we should have 5 free blocks left.
@@ -269,14 +436,14 @@ def test_prefill_plp():
                         prompt_logprobs=5)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(manager.req_to_block_hashes[req2.request_id]) == 0
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req2, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     block_ids = blocks.get_block_ids()
     # Duplicate cached blocks have different ids but same hashes vs request #0
-    assert [b.block_hash for b in blocks.blocks] == req0_block_hashes
+    assert [b.block_hash for b in blocks.blocks[0]] == req0_block_hashes
     assert block_ids != [[1, 2, 3, 4]]
 
     # Request #2 block hashes are valid since request #0 hashes are.
@@ -302,10 +469,10 @@ def test_decode():
     unique_token_ids = [3] * 7
     req0 = make_request("0", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[1, 2, 3, 4]]
 
@@ -314,10 +481,10 @@ def test_decode():
     for _ in range(4):
         req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 4,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 0
-    assert manager.single_type_manager.req_to_blocks[
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
+    assert manager.coordinator.single_type_managers[0].req_to_blocks[
         req0.request_id][-1].block_hash is None
 
     # Append slots with allocating a new block.
@@ -327,12 +494,12 @@ def test_decode():
     for _ in range(9 + 10):
         req0.append_output_token_ids(7)
     new_blocks = manager.allocate_slots(req0, 19,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 1
-    assert manager.single_type_manager.req_to_blocks[
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 1
+    assert manager.coordinator.single_type_managers[0].req_to_blocks[
         req0.request_id][-2].block_hash is not None
-    assert manager.single_type_manager.req_to_blocks[
+    assert manager.coordinator.single_type_managers[0].req_to_blocks[
         req0.request_id][-1].block_hash is None
 
 
@@ -346,23 +513,23 @@ def test_evict():
     last_token_id = 5 * 16 + 7
     req0 = make_request("0", list(range(last_token_id)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 5 * 16 + 7,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 6  # 5 full + 1 partial
+    assert len(blocks.blocks[0]) == 6  # 5 full + 1 partial
 
     # 3 blocks.
     req1 = make_request("1", list(range(last_token_id,
                                         last_token_id + 3 * 16)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, 3 * 16,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 3  # 3 full blocks
+    assert len(blocks.blocks[0]) == 3  # 3 full blocks
     last_token_id += 3 * 16
 
     # 10 - (6 + 3) == 1
@@ -382,7 +549,7 @@ def test_evict():
     assert computed_blocks.get_block_ids() == [[1, 2]]
     assert num_computed_tokens == 2 * 16
     blocks = manager.allocate_slots(req2, 3,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[10]]
     assert manager.block_pool.free_block_queue.num_free_blocks == 7
@@ -404,12 +571,12 @@ def test_hash_block_correct_reuse():
     num_tokens = block_size * 1
     req = make_request("0", list(range(num_tokens)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req, num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
+    assert len(blocks.blocks[0]) == 1
 
     # Deallocate the block.
     manager.free(req)
@@ -418,15 +585,15 @@ def test_hash_block_correct_reuse():
     # block is cleared.
     req = make_request("1", list(range(num_tokens - 1)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req, num_tokens - 1,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
+    assert len(blocks.blocks[0]) == 1
 
-    assert manager.block_pool.blocks[
-        blocks.blocks[0].block_id].block_hash is None
+    assert manager.block_pool.blocks[blocks.blocks[0]
+                                     [0].block_id].block_hash is None
 
 
 def test_computed_blocks_not_evicted():
@@ -445,24 +612,24 @@ def test_computed_blocks_not_evicted():
     num_tokens = block_size * 1
     req0 = make_request("0", list(range(num_tokens)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
-    assert blocks.blocks[0].block_id == 1
+    assert len(blocks.blocks[0]) == 1
+    assert blocks.blocks[0][0].block_id == 1
 
     # Allocate another block.
     req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
-    assert blocks.blocks[0].block_id == 2
+    assert len(blocks.blocks[0]) == 1
+    assert blocks.blocks[0][0].block_id == 2
 
     # Free the blocks.
     manager.free(req0)
@@ -472,15 +639,15 @@ def test_computed_blocks_not_evicted():
     # cached block rather than the first one.
     req2 = make_request("2", list(range(num_tokens * 2)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert len(computed_blocks.blocks) == 1
-    assert computed_blocks.blocks[0].block_id == 1
+    assert len(computed_blocks.blocks[0]) == 1
+    assert computed_blocks.blocks[0][0].block_id == 1
     assert num_computed_tokens == block_size
 
     blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
-    assert blocks.blocks[0].block_id == 2
+    assert len(blocks.blocks[0]) == 1
+    assert blocks.blocks[0][0].block_id == 2
 
 
 def test_basic_prefix_caching_disabled():
@@ -497,12 +664,12 @@ def test_basic_prefix_caching_disabled():
     req1 = make_request("1", list(range(10)))  # 2 blocks and some more
 
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, 10,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 3
+    assert len(blocks.blocks[0]) == 3
 
     # Free the blocks.
     manager.free(req1)
@@ -510,20 +677,20 @@ def test_basic_prefix_caching_disabled():
     # No caching.
     req2 = make_request("2", list(range(16)))  # shared prefix
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req2, 16,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 4
+    assert len(blocks.blocks[0]) == 4
 
     # New requests should not have any blocks.
     req3 = make_request("3", list(range(4)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req3, 4,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert not blocks
 
@@ -558,6 +725,7 @@ def test_cache_blocks(hash_fn):
         num_full_blocks=2,
         block_size=block_size,
         hash_fn=hash_fn,
+        kv_cache_group_id=0,
     )
 
     assert len(block_pool.cached_block_hash_to_block) == 2
@@ -573,11 +741,83 @@ def test_cache_blocks(hash_fn):
         num_full_blocks=3,
         block_size=block_size,
         hash_fn=hash_fn,
+        kv_cache_group_id=0,
     )
     assert len(block_pool.cached_block_hash_to_block) == 3
     assert blocks[0].block_hash is not None
 
 
+def test_cache_blocks_multi_group():
+    """
+    This tests that blocks are cached correctly for different kv cache groups.
+    """
+    block_size = 4
+    block_pool = BlockPool(num_gpu_blocks=10, enable_caching=True)
+
+    # Req:
+    #  Block 0/4: [0, 1, 2, 3]
+    #  Block 1/5: [4, 5, 6, 7]
+    #  Block 2/6: [8, 9, 10, 11]
+    #  Block 3/7: [12, 13]
+    req = make_request("0", list(range(14)))
+
+    # Cache the blocks for group 0.
+    blocks = [KVCacheBlock(block_id=i) for i in range(2)]
+    block_hashes: list[BlockHash] = []
+    block_pool.cache_full_blocks(
+        request=req,
+        blocks=blocks,
+        block_hashes=block_hashes,
+        num_cached_blocks=0,
+        num_full_blocks=2,
+        block_size=block_size,
+        hash_fn=hash,
+        kv_cache_group_id=0,
+    )
+    assert len(block_pool.cached_block_hash_to_block) == 2
+    assert len(block_hashes) == 2
+    assert all([block.block_hash is not None for block in blocks])
+
+    # Cache the blocks for group 1.
+    blocks = [KVCacheBlock(block_id=i) for i in range(3)]
+    block_pool.cache_full_blocks(
+        request=req,
+        blocks=blocks,
+        block_hashes=block_hashes,
+        num_cached_blocks=0,
+        num_full_blocks=3,
+        block_size=block_size,
+        hash_fn=hash,
+        kv_cache_group_id=1,
+    )
+    assert len(block_pool.cached_block_hash_to_block) == 5
+    assert len(block_hashes) == 3
+    assert all([block.block_hash is not None for block in blocks])
+
+    # Block hash 0: hit for group 0 and 1
+    # Block hash 1: hit for group 0 and 1
+    # Block hash 2: hit for group 1
+
+    assert block_pool.get_cached_block(block_hashes[0],
+                                       kv_cache_group_ids=[0]) is not None
+    assert block_pool.get_cached_block(block_hashes[1],
+                                       kv_cache_group_ids=[0]) is not None
+    assert block_pool.get_cached_block(block_hashes[2],
+                                       kv_cache_group_ids=[0]) is None
+    assert block_pool.get_cached_block(block_hashes[0],
+                                       kv_cache_group_ids=[1]) is not None
+    assert block_pool.get_cached_block(block_hashes[1],
+                                       kv_cache_group_ids=[1]) is not None
+    assert block_pool.get_cached_block(block_hashes[2],
+                                       kv_cache_group_ids=[1]) is not None
+    assert block_pool.get_cached_block(block_hashes[0],
+                                       kv_cache_group_ids=[0, 1]) is not None
+    assert block_pool.get_cached_block(block_hashes[1],
+                                       kv_cache_group_ids=[0, 1]) is not None
+    assert block_pool.get_cached_block(block_hashes[2],
+                                       kv_cache_group_ids=[0, 1]) is None
+
+
 def test_mm_prefix_caching():
     """
     This tests that the multi-modal prefix caching is correct.
@@ -614,7 +854,7 @@ def test_mm_prefix_caching():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
     # Completed block should have hashes with extra keys.
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     block_hashes = manager.req_to_block_hashes[req0.request_id]
     assert len(block_hashes) == 3
@@ -623,7 +863,7 @@ def test_mm_prefix_caching():
     assert block_hashes[2].extra_keys == ("bbb", )
 
     blocks = manager.allocate_slots(req0, 59,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[1, 2, 3, 4]]
     req0.num_computed_tokens = 59
@@ -632,9 +872,9 @@ def test_mm_prefix_caching():
     for _ in range(5):
         req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 5,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 0
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
 
     # The just completed block should have hashes with extra keys.
     assert len(block_hashes) == 4
@@ -652,7 +892,7 @@ def test_mm_prefix_caching():
                         mm_positions=mm_positions,
                         mm_hashes=mm_hashes)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert len(computed_blocks.blocks) == 3
+    assert len(computed_blocks.blocks[0]) == 3
     assert num_computed_tokens == 3 * 16
 
 
@@ -675,7 +915,7 @@ def test_cache_key_salting():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
     # Completed block should have hashes with extra keys.
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     block_hashes = manager.req_to_block_hashes[req0.request_id]
     assert len(block_hashes) == 3
@@ -684,7 +924,7 @@ def test_cache_key_salting():
     assert block_hashes[2].extra_keys is None
 
     blocks = manager.allocate_slots(req0, 59,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[1, 2, 3, 4]]
     req0.num_computed_tokens = 59
@@ -693,9 +933,9 @@ def test_cache_key_salting():
     for _ in range(5):
         req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 5,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 0
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
 
     # Now one more block that should not have extra keys.
     assert len(block_hashes) == 4
@@ -706,14 +946,14 @@ def test_cache_key_salting():
     req1 = make_request("1", token_ids, cache_salt="salt1")
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     # Should match only a prefix of 3 blocks.
-    assert len(computed_blocks.blocks) == 3
+    assert len(computed_blocks.blocks[0]) == 3
     assert num_computed_tokens == 3 * block_size
 
     # Test cache miss with same content but different salt.
     token_ids = common_token_ids + [4] * 11
     req2 = make_request("2", token_ids, cache_salt="salt2")
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert len(computed_blocks.blocks) == 0
+    assert len(computed_blocks.blocks[0]) == 0
     assert num_computed_tokens == 0
     block_hashes = manager.req_to_block_hashes[req2.request_id]
     assert len(block_hashes) == 3
@@ -738,20 +978,24 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     common_token_ids = [i for i in range(3) for _ in range(16)]
     req0 = make_request("0", common_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     manager.allocate_slots(req0, 48,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
-    block_part0 = manager.single_type_manager.req_to_blocks[req0.request_id]
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
+    block_part0 = manager.coordinator.single_type_managers[0].req_to_blocks[
+        req0.request_id]
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
     req1 = make_request("1", common_token_ids * 2)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert computed_blocks.blocks == block_part0
+    assert computed_blocks.blocks[0] == block_part0
     assert num_computed_tokens == 3 * 16
     manager.allocate_slots(req1, 48,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
-    block_part1 = manager.single_type_manager.req_to_blocks[req1.request_id]
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
+    block_part1 = manager.coordinator.single_type_managers[0].req_to_blocks[
+        req1.request_id]
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
     # | Req1-5(F)| ... |
     manager.free(req1)
@@ -762,10 +1006,11 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     # | Req1-5(F)| Req2-0   | Req2-1   | ... |
     req2 = make_request("2", [7] * block_size * 2)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     manager.allocate_slots(req2, block_size * 2,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
 
     # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
     # but it cannot be allocated due to insufficient free blocks (2).
@@ -773,11 +1018,11 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     assert manager.block_pool.free_block_queue.num_free_blocks == 5
     req3 = make_request("3", common_token_ids * 3)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
-    assert computed_blocks.blocks == block_part1
+    assert computed_blocks.blocks[0] == block_part1
     assert num_computed_tokens == 6 * 16
     # Req3 cannot be allocated.
     assert manager.allocate_slots(req3, 48,
-                                  len(computed_blocks.blocks) * 16,
+                                  len(computed_blocks.blocks[0]) * 16,
                                   computed_blocks) is None
     # Block 0-2 are used by Req 1.
     assert {block.ref_cnt for block in block_part1[:3]} == {1}
@@ -804,9 +1049,9 @@ def test_reset_prefix_cache():
     req1 = make_request("1", all_token_ids)
     computed_blocks, _ = manager.get_computed_blocks(req1)
     assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert len(computed_blocks.blocks) == 3
+    assert len(computed_blocks.blocks[0]) == 3
     blocks = manager.allocate_slots(req1, 7,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert blocks.get_block_ids() == [[5]]
 
@@ -836,10 +1081,11 @@ def test_prefix_cache_stats_disabled():
     # Call all functions that check whether log_stats is disabled.
     req = make_request("0", list(range(16)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     manager.allocate_slots(req, 16,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     manager.reset_prefix_cache()
 
     # Ensure prefix_cache_stats remains None
@@ -918,7 +1164,8 @@ def test_eagle_enabled_removes_last_block():
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
     manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     manager.free(req)
 
     # New request with same tokens + Eagle enabled
@@ -928,7 +1175,7 @@ def test_eagle_enabled_removes_last_block():
     # Should retain 1 block:
     # 1. Original 3 blocks → pop last hash → 2 matched blocks
     # 2. drop last matched block → 1 remaining block
-    assert len(computed_blocks.blocks) == 1
+    assert len(computed_blocks.blocks[0]) == 1
     assert num_tokens == 1 * block_size  # 16 tokens
 
 
@@ -948,14 +1195,15 @@ def test_eagle_with_partial_blocks():
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
     manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     manager.free(req)
 
     # New request with Eagle enabled
     req_eagle = make_request("partial_eagle", token_ids)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle)
     # Original match: 2 full blocks → Eagle removes 1 → 1 remaining
-    assert len(computed_blocks.blocks) == 1
+    assert len(computed_blocks.blocks[0]) == 1
     assert num_tokens == 1 * block_size
 
 
@@ -973,7 +1221,7 @@ def test_eagle_with_sliding_window():
     manager = KVCacheManager(
         KVCacheConfig(
             num_blocks=10,
-            tensors={},
+            kv_cache_tensors=[],
             kv_cache_groups=[KVCacheGroupSpec(['layer'], sliding_window_spec)],
         ),
         max_model_len=8192,
@@ -988,7 +1236,8 @@ def test_eagle_with_sliding_window():
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
     manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     # record the block hash of the first block in the request for later use
     block_hash_first_block = manager.req_to_block_hashes[req.request_id][0]
     assert block_hash_first_block is not None
@@ -998,13 +1247,14 @@ def test_eagle_with_sliding_window():
     req_eagle = make_request("partial_eagle", token_ids)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle)
     # Original match: 2 full blocks → Eagle removes 1 → 1 remaining
-    assert len(computed_blocks.blocks) == 1
+    assert len(computed_blocks.blocks[0]) == 1
     assert num_tokens == 1 * block_size
 
     # Evict the first block in the request
     assert manager.block_pool.get_cached_block(
-        block_hash_first_block) is not None
-    manager.block_pool.cached_block_hash_to_block.pop(block_hash_first_block)
+        block_hash_first_block, kv_cache_group_ids=[0]) is not None
+    manager.block_pool.cached_block_hash_to_block.pop(
+        BlockHashWithGroupId(block_hash_first_block, 0))
 
     # New request
     req_after_evict = make_request("partial_eagle_after_evict", token_ids)
@@ -1012,5 +1262,5 @@ def test_eagle_with_sliding_window():
     # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is
     # not considered. But after dropping the last matched block due to eagle,
     # there will be no matched prefix.
-    assert len(computed_blocks.blocks) == 0
+    assert len(computed_blocks.blocks[0]) == 0
     assert num_tokens == 0
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index aa074f1bb..d348956aa 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -97,7 +97,7 @@ def create_scheduler(
     )
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(['layer'],
                              FullAttentionSpec(block_size, 1, 1, torch.float32,
@@ -814,10 +814,10 @@ def _assert_right_kv_cache_manager(
     # Make sure the request stats are right.
     EXPECTED_TOTAL_BLOCKS = num_tokens // block_size
     for req_id in req_ids:
-        blocks = (scheduler.kv_cache_manager.single_type_manager.
-                  req_to_blocks[req_id])
+        blocks = (scheduler.kv_cache_manager.coordinator.
+                  single_type_managers[0].req_to_blocks[req_id])
         hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id]
-        assert (scheduler.kv_cache_manager.single_type_manager.
+        assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0].
                 num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS)
         assert len(blocks) == EXPECTED_TOTAL_BLOCKS
         assert len(hashes) == EXPECTED_TOTAL_BLOCKS
@@ -1198,11 +1198,11 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.req_to_blocks) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               req_to_blocks) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.num_cached_block) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py
index 92ce8ea8b..a9e1898df 100644
--- a/tests/v1/core/test_specialized_manager.py
+++ b/tests/v1/core/test_specialized_manager.py
@@ -4,7 +4,8 @@
 import torch
 
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
+from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
+                                         KVCacheBlock)
 from vllm.v1.core.single_type_kv_cache_manager import SlidingWindowManager
 from vllm.v1.kv_cache_interface import SlidingWindowSpec
 
@@ -12,9 +13,8 @@ from vllm.v1.kv_cache_interface import SlidingWindowSpec
 def get_sliding_window_manager(sliding_window_spec, block_pool):
     return SlidingWindowManager(sliding_window_spec,
                                 block_pool,
-                                use_eagle=False,
-                                num_kv_cache_groups=1,
-                                caching_hash_fn=lambda x: x)
+                                caching_hash_fn=lambda x: x,
+                                kv_cache_group_id=0)
 
 
 def test_sliding_window_possible_cached_prefix():
@@ -42,13 +42,18 @@ def test_sliding_window_possible_cached_prefix():
         for i, (block_hash,
                 is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
             if is_cached:
-                block_pool.cached_block_hash_to_block[block_hash] = {
-                    i: block_pool.blocks[i + 10]
-                }
+                block_pool.cached_block_hash_to_block[BlockHashWithGroupId(
+                    block_hash, 0)] = {
+                        i: block_pool.blocks[i + 10],
+                    }
 
         computed_blocks = manager.find_longest_cache_hit(
-            block_hash_list,
-            len(block_hash_list) * block_size)
+            block_hashes=block_hash_list,
+            max_length=len(block_hash_list) * block_size,
+            kv_cache_group_ids=[0],
+            block_pool=block_pool,
+            kv_cache_spec=sliding_window_spec,
+            use_eagle=False)[0]
         assert len(computed_blocks) == expect_length
 
         assert all(block == block_pool.null_block
@@ -95,13 +100,13 @@ def test_sliding_window_remove_skipped_blocks():
 
     null_block_id = block_pool.null_block.block_id
 
-    def id_to_block_table(ids):
+    def id_to_block_table(ids) -> list[KVCacheBlock]:
         return [
             KVCacheBlock(id_)
             if id_ != null_block_id else block_pool.null_block for id_ in ids
         ]
 
-    def assert_block_id(block_table, ids):
+    def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
         for block, id_ in zip(block_table, ids):
             if id_ == null_block_id:
                 assert block == block_pool.null_block
diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py
index 3eedc535d..d8882b1d9 100644
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@@ -18,7 +18,7 @@ class TestConfig:
 
 model_config = {
     "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
-    "google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
+    "google/gemma-3-1b-it": TestConfig(4096, (400, 800)),
 }
 
 
@@ -26,7 +26,7 @@ model_config = {
     "model",
     [
         "bigcode/starcoder2-3b",  # sliding window only
-        "google/gemma-2-2b-it",  # sliding window + full attention
+        "google/gemma-3-1b-it",  # sliding window + full attention
     ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 9b257143d..622ab6f35 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -36,8 +36,8 @@ def test_basic_inferface():
     req_meta = kv_connector_metadata.requests[request_id]
 
     for block_id, block in zip(
-            req_meta.local_block_ids, scheduler.kv_cache_manager.
-            single_type_manager.req_to_blocks[request_id]):
+            req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.
+            single_type_managers[0].req_to_blocks[request_id]):
         assert block_id == block.block_id
 
 
diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
index 52dc21a2c..ff36a281c 100644
--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -54,8 +54,8 @@ def test_basic_lifecycle():
     assert len(scheduler.waiting) == 0
 
     # ... but blocks should not be freed.
-    blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_id]
+    blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_id]
     for block in blocks:
         assert block.ref_cnt == 1
 
diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
index 2312e2135..a1156306d 100644
--- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
@@ -51,8 +51,8 @@ def test_basic_lifecycle():
     assert (block_pool.free_block_queue.num_free_blocks
             < START_FREE_BLOCK_QUEUE_SIZE)
     assert len(block_pool.cached_block_hash_to_block) == 0
-    blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_id]
+    blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_id]
     for block in blocks:
         assert block._block_hash is None
 
@@ -87,8 +87,8 @@ def test_basic_lifecycle():
 
     # Confirm the block are actually allocated.
     num_hashed_blocks = 0
-    blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_id]
+    blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_id]
     for block in blocks:
         assert block.ref_cnt == 1
         num_hashed_blocks += (1 if block._block_hash is not None else 0)
@@ -261,10 +261,10 @@ def test_no_spurious_prefix_caching():
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 1
 
-    local_blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_local.request_id]
-    remote_blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[  # noqa: E501
-        request_remote.request_id]
+    local_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_local.request_id]
+    remote_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_remote.request_id]
 
     # Local should have cached blocks (but not all due to preallocate).
     num_hashed_blocks = 0
@@ -300,8 +300,8 @@ def test_full_block_prompt():
     # STEP (1): Initialize a recv.
     scheduler_output = scheduler.schedule()
     # All blocks should be allocated.
-    num_blocks = len(scheduler.kv_cache_manager.single_type_manager.
-                     req_to_blocks[request_id])
+    num_blocks = len(scheduler.kv_cache_manager.coordinator.
+                     single_type_managers[0].req_to_blocks[request_id])
     assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS
     model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -319,8 +319,8 @@ def test_full_block_prompt():
 
     # We need to recompute the final token of the prompt to generate
     # the first new token, so we should not have a new block.
-    num_blocks = len(scheduler.kv_cache_manager.single_type_manager.
-                     req_to_blocks[request_id])
+    num_blocks = len(scheduler.kv_cache_manager.coordinator.
+                     single_type_managers[0].req_to_blocks[request_id])
     assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS
     assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens ==
             NUM_TOKENS - 1)
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index e190e9561..4a9e3a7ad 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -32,11 +32,11 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.req_to_blocks) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               req_to_blocks) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.num_cached_block) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
@@ -96,7 +96,7 @@ def create_scheduler(
     block_size = vllm_config.cache_config.block_size
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(['layer'],
                              FullAttentionSpec(block_size, 1, 1, torch.float32,
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 0553d94de..caacb1652 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -40,12 +40,13 @@ def initialize_kv_cache(runner: GPUModelRunner):
     tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS
     kv_cache_config = KVCacheConfig(
         num_blocks=NUM_BLOCKS,
-        tensors={
-            "layer.0": KVCacheTensor(size=tensor_size),
-        },
+        kv_cache_tensors=[
+            KVCacheTensor(size=tensor_size, shared_by=["layer.0"]),
+        ],
         kv_cache_groups=[
             KVCacheGroupSpec(layer_names=["layer.0"], kv_cache_spec=attn_spec)
-        ])
+        ],
+    )
     runner.kv_cache_config = kv_cache_config
     runner.input_batch = InputBatch(
         max_num_reqs=runner.max_num_reqs,
@@ -518,9 +519,9 @@ def test_init_kv_cache_without_kv_sharing():
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.tensors) == 2
-    assert kv_cache_config.tensors[layer_0].size == available_memory // 2
-    assert kv_cache_config.tensors[layer_1].size == available_memory // 2
+    assert len(kv_cache_config.kv_cache_tensors) == 2
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
+    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
 
     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
@@ -530,9 +531,9 @@ def test_init_kv_cache_without_kv_sharing():
     # important: override tensor size to prevent large mem alloc during test
     # this will only allocate 2 block worth of memory (2 * 32kb)
     kv_cache_config.num_blocks = 1
-    for layer in kv_cache_config.tensors:
-        kv_cache_config.tensors[layer].size =\
-            kv_cache_spec[layer].page_size_bytes
+    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+        kv_cache_tensor.size = (
+            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
 
     runner.initialize_kv_cache(kv_cache_config)
 
@@ -589,10 +590,10 @@ def test_init_kv_cache_with_kv_sharing_valid():
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.tensors) == 1
+    assert len(kv_cache_config.kv_cache_tensors) == 1
     # Each layer now has twice the available memory for KV cache
     # compared to no KV sharing
-    assert kv_cache_config.tensors[layer_0].size == available_memory
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory
 
     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
@@ -602,7 +603,7 @@ def test_init_kv_cache_with_kv_sharing_valid():
     # important: override tensor size to prevent large mem alloc during test
     # this will only allocate 1 block worth of memory (32kb)
     kv_cache_config.num_blocks = 1
-    kv_cache_config.tensors[layer_0].size =\
+    kv_cache_config.kv_cache_tensors[0].size =\
         kv_cache_spec[layer_0].page_size_bytes
 
     runner.initialize_kv_cache(kv_cache_config)
diff --git a/vllm/config.py b/vllm/config.py
index a07c41dda..cd6ac4f89 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2104,6 +2104,12 @@ class SchedulerConfig:
     default scheduler. Can be a class directly or the path to a class of form
     "mod.custom_class"."""
 
+    disable_hybrid_kv_cache_manager: bool = False
+    """If set to True, KV cache manager will allocate the same size of KV cache
+    for all attention layers even if there are multiple type of attention layers
+    like full attention and sliding window attention.
+    """
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -4465,6 +4471,21 @@ class VllmConfig:
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]
 
+        if (envs.VLLM_USE_V1
+                and not self.scheduler_config.disable_hybrid_kv_cache_manager):
+            # logger should only print warning message for hybrid models. As we
+            # can't know whether the model is hybrid or not now, so we don't log
+            # warning message here and will log it later.
+            if not (current_platform.is_cuda() or current_platform.is_rocm()):
+                # Hybrid KV cache manager is not supported on non-GPU platforms.
+                self.disable_hybrid_kv_cache_manager = True
+            if self.kv_transfer_config is not None:
+                # Hybrid KV cache manager is not compatible with KV transfer.
+                self.disable_hybrid_kv_cache_manager = True
+            if self.kv_events_config is not None:
+                # Hybrid KV cache manager is not compatible with KV events.
+                self.disable_hybrid_kv_cache_manager = True
+
     def update_sizes_for_sequence_parallelism(self,
                                               possible_sizes: list) -> list:
         # remove the sizes that not multiple of tp_size when
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1ba50fec9..81f160968 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -387,6 +387,9 @@ class EngineArgs:
         bool] = SchedulerConfig.enable_chunked_prefill
     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
 
+    disable_hybrid_kv_cache_manager: bool = (
+        SchedulerConfig.disable_hybrid_kv_cache_manager)
+
     guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
     guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
     guided_decoding_disable_any_whitespace: bool = \
@@ -849,6 +852,9 @@ class EngineArgs:
             **scheduler_kwargs["disable_chunked_mm_input"])
         scheduler_group.add_argument("--scheduler-cls",
                                      **scheduler_kwargs["scheduler_cls"])
+        scheduler_group.add_argument(
+            "--disable-hybrid-kv-cache-manager",
+            **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
 
         # vLLM arguments
         vllm_kwargs = get_kwargs(VllmConfig)
@@ -1174,6 +1180,8 @@ class EngineArgs:
             max_num_partial_prefills=self.max_num_partial_prefills,
             max_long_partial_prefills=self.max_long_partial_prefills,
             long_prefill_token_threshold=self.long_prefill_token_threshold,
+            disable_hybrid_kv_cache_manager=self.
+            disable_hybrid_kv_cache_manager,
         )
 
         lora_config = LoRAConfig(
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index 5118e4d8e..3b2a4f936 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -7,8 +7,8 @@ from typing import Callable, Optional
 from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved,
                                         BlockStored, KVCacheEvent)
 from vllm.logger import init_logger
-from vllm.v1.core.kv_cache_utils import (BlockHash, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
+                                         FreeKVCacheBlockQueue, KVCacheBlock,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens)
 from vllm.v1.request import Request
@@ -27,6 +27,7 @@ class BlockPool:
     Args:
         num_gpu_blocks: The number of blocks in the pool.
         enable_caching: Whether to enable prefix caching.
+        enable_kv_cache_events: Whether to enable kv cache events.
     """
 
     def __init__(
@@ -56,7 +57,7 @@ class BlockPool:
         # if there is already an identical block in the cache. This is because
         # we want to make sure the allocated block IDs won't change so that
         # block tables are append-only.
-        self.cached_block_hash_to_block: dict[BlockHash, dict[
+        self.cached_block_hash_to_block: dict[BlockHashWithGroupId, dict[
             int, KVCacheBlock]] = defaultdict(dict)
 
         # To represent a placeholder block with block_id=0.
@@ -68,22 +69,29 @@ class BlockPool:
         self.enable_kv_cache_events = enable_kv_cache_events
         self.kv_event_queue: list[KVCacheEvent] = []
 
-    def get_cached_block(self,
-                         block_hash: BlockHash) -> Optional[KVCacheBlock]:
-        """Get a cached block by the block hash, or None if cache miss.
+    def get_cached_block(
+            self, block_hash: BlockHash,
+            kv_cache_group_ids: list[int]) -> Optional[list[KVCacheBlock]]:
+        """Get the cached block by the block hash for each group in 
+        `kv_cache_group_ids`, or None if cache miss for any group.
         If there are duplicated blocks, we return the first block in the cache.
 
         Args:
             block_hash: The hash value of the block.
+            kv_cache_group_ids: The ids of the KV cache groups.
 
         Returns:
-            The cached block if it exists, or None.
+            The cached blocks if exists, or None.
         """
-        cached_blocks = self.cached_block_hash_to_block.get(block_hash)
-        if not cached_blocks:
-            return None
-        first_block_id = next(iter(cached_blocks))
-        return cached_blocks[first_block_id]
+        cached_blocks = []
+        for group_id in kv_cache_group_ids:
+            cached_blocks_one_group = self.cached_block_hash_to_block.get(
+                BlockHashWithGroupId(block_hash, group_id))
+            if not cached_blocks_one_group:
+                return None
+            first_block_id = next(iter(cached_blocks_one_group))
+            cached_blocks.append(cached_blocks_one_group[first_block_id])
+        return cached_blocks
 
     def cache_full_blocks(
         self,
@@ -93,6 +101,7 @@ class BlockPool:
         num_cached_blocks: int,
         num_full_blocks: int,
         block_size: int,
+        kv_cache_group_id: int,
         hash_fn: Callable,
     ) -> None:
         """Cache a list of full blocks for prefix caching.
@@ -112,6 +121,7 @@ class BlockPool:
             num_full_blocks: The number of blocks that are full and should
                 be cached after this function.
             block_size: Number of tokens in each block.
+            kv_cache_group_id: The id of the KV cache group.
             hash_fn: The hash function to use for block hashes.
         """
         if num_cached_blocks == num_full_blocks:
@@ -126,7 +136,7 @@ class BlockPool:
         else:
             prev_block = blocks[num_cached_blocks - 1]
             assert prev_block.block_hash is not None
-            prev_block_hash_value = prev_block.block_hash.hash_value
+            prev_block_hash_value = prev_block.block_hash.get_hash_value()
 
         parent_block_hash = prev_block_hash_value
         new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events
@@ -138,8 +148,9 @@ class BlockPool:
                 # The block hash may already be computed in
                 # "get_computed_blocks" if the tokens are not generated by
                 # this request (either the prompt tokens or the previously
-                # generated tokens with preemption). In this case we simply
-                # reuse the block hash.
+                # generated tokens with preemption), or by other
+                # single_type_managers with the same block_size.
+                # In this case we simply reuse the block hash.
                 block_hash = new_block_hashes[i]
             else:
                 # Otherwise compute the block hash and cache it in the request
@@ -166,8 +177,11 @@ class BlockPool:
                 block_hashes.append(block_hash)
 
             # Update and added the full block to the cache.
-            blk.block_hash = block_hash
-            self.cached_block_hash_to_block[block_hash][blk.block_id] = blk
+            block_hash_with_group_id = BlockHashWithGroupId(
+                block_hash, kv_cache_group_id)
+            blk.block_hash = block_hash_with_group_id
+            self.cached_block_hash_to_block[block_hash_with_group_id][
+                blk.block_id] = blk
             if new_hashes is not None:
                 new_hashes.append(block_hash.hash_value)
             prev_block_hash_value = block_hash.hash_value
@@ -237,12 +251,16 @@ class BlockPool:
                 del self.cached_block_hash_to_block[block_hash]
 
             if self.enable_kv_cache_events:
+                # FIXME (Chen): Not sure whether we should return `hash_value`
+                # or `(hash_value, group_id)` here. But it's fine now because
+                # we disable hybrid kv cache manager when kv cache event is
+                # enabled, so there is only one group.
                 self.kv_event_queue.append(
-                    BlockRemoved(block_hashes=[block_hash.hash_value]))
+                    BlockRemoved(block_hashes=[block_hash.get_hash_value()]))
             return True
         return False
 
-    def touch(self, blocks: list[KVCacheBlock]) -> None:
+    def touch(self, blocks: list[list[KVCacheBlock]]) -> None:
         """Touch a block increases its reference count by 1, and may remove
         the block from the free queue. This is used when a block is hit by
         another request with the same prefix.
@@ -250,12 +268,13 @@ class BlockPool:
         Args:
             blocks: A list of blocks to touch.
         """
-        for block in blocks:
-            # ref_cnt=0 means this block is in the free list (i.e. eviction
-            # candidate), so remove it.
-            if block.ref_cnt == 0 and not block.is_null:
-                self.free_block_queue.remove(block)
-            block.incr_ref()
+        for blocks_per_group in blocks:
+            for block in blocks_per_group:
+                # ref_cnt=0 means this block is in the free list (i.e. eviction
+                # candidate), so remove it.
+                if block.ref_cnt == 0 and not block.is_null:
+                    self.free_block_queue.remove(block)
+                block.incr_ref()
 
     def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
         """Free a list of blocks. The blocks should be ordered by their
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
new file mode 100644
index 000000000..993ce4b48
--- /dev/null
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -0,0 +1,358 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from typing import Callable, Optional
+
+from vllm.v1.core.block_pool import BlockPool
+from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
+from vllm.v1.core.single_type_kv_cache_manager import (
+    FullAttentionManager, SingleTypeKVCacheManager,
+    get_manager_for_kv_cache_spec)
+from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig
+from vllm.v1.request import Request
+
+
+class KVCacheCoordinator(ABC):
+    """
+    Coordinate the KV cache of different KV cache groups.
+    """
+
+    def __init__(
+        self,
+        kv_cache_config: KVCacheConfig,
+        max_model_len: int,
+        use_eagle: bool,
+        enable_caching: bool,
+        caching_hash_fn: Callable,
+        enable_kv_cache_events: bool,
+    ):
+        self.kv_cache_config = kv_cache_config
+        self.max_model_len = max_model_len
+
+        self.block_pool = BlockPool(kv_cache_config.num_blocks, enable_caching,
+                                    enable_kv_cache_events)
+        self.single_type_managers: list[SingleTypeKVCacheManager] = []
+
+        # Needs special handling for find_longest_cache_hit if eagle is enabled
+        self.use_eagle = use_eagle
+
+        for i in range(len(self.kv_cache_config.kv_cache_groups)):
+            kv_cache_spec = self.kv_cache_config.kv_cache_groups[
+                i].kv_cache_spec
+            self.single_type_managers.append(
+                get_manager_for_kv_cache_spec(
+                    kv_cache_spec=kv_cache_spec,
+                    block_pool=self.block_pool,
+                    kv_cache_group_id=i,
+                    caching_hash_fn=caching_hash_fn,
+                ))
+
+    def get_num_blocks_to_allocate(
+            self, request_id: str, num_tokens: int,
+            new_computed_blocks: list[list[KVCacheBlock]]) -> int:
+        """
+        Get the number of blocks needed to be allocated for the request.
+
+        Args:
+            request_id: The request ID.
+            num_tokens: The total number of tokens that need a slot (including 
+                tokens that are already allocated).
+            new_computed_blocks: The new computed blocks just hitting the
+                prefix caching.
+
+        Returns:
+            The number of blocks.
+        """
+        num_blocks_to_allocate = 0
+        for i, manager in enumerate(self.single_type_managers):
+            num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
+                request_id, num_tokens, new_computed_blocks[i])
+        return num_blocks_to_allocate
+
+    def save_new_computed_blocks(
+            self, request_id: str,
+            new_computed_blocks: list[list[KVCacheBlock]]) -> None:
+        """
+        Add the new computed blocks to the request.
+
+        Args:
+            request_id: The request ID.
+            new_computed_blocks: The new computed blocks just hitting the
+                prefix cache.
+        """
+        for i, manager in enumerate(self.single_type_managers):
+            manager.save_new_computed_blocks(request_id,
+                                             new_computed_blocks[i])
+
+    def allocate_new_blocks(self, request_id: str,
+                            num_tokens: int) -> list[list[KVCacheBlock]]:
+        """
+        Allocate new blocks for the request to give it at least `num_tokens` 
+        token slots.
+
+        Args:
+            request_id: The request ID.
+            num_tokens: The total number of tokens that need a slot (including 
+                tokens that are already allocated).
+
+        Returns:
+            The new allocated blocks.
+        """
+        new_blocks = []
+        for manager in self.single_type_managers:
+            new_blocks.append(
+                manager.allocate_new_blocks(request_id, num_tokens))
+        return new_blocks
+
+    def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
+                     num_computed_tokens: int) -> None:
+        """
+        Cache the blocks for the request.
+
+        Args:
+            request: The request.
+            block_hashes: The block hashes of the request.
+            num_tokens: The total number of tokens that need to be cached 
+                (including tokens that are already cached).
+        """
+        for manager in self.single_type_managers:
+            manager.cache_blocks(request, block_hashes, num_computed_tokens)
+
+    def free(self, request_id: str) -> None:
+        """
+        Free the blocks for the request.
+
+        Args:
+            request_id: The request ID.
+        """
+        for manager in self.single_type_managers:
+            manager.free(request_id)
+
+    def get_num_common_prefix_blocks(self, request_id: str,
+                                     num_running_requests: int) -> list[int]:
+        """
+        Get the number of common prefix blocks for a request.
+
+        Args:
+            request_id: The request ID.
+            block_hashes: The block hashes of the request.
+
+        Returns:
+            The number of common prefix blocks.
+        """
+        num_blocks_per_group = [
+            manager.get_num_common_prefix_blocks(request_id,
+                                                 num_running_requests)
+            for manager in self.single_type_managers
+        ]
+        return num_blocks_per_group
+
+    def remove_skipped_blocks(self, request_id: str,
+                              num_computed_tokens: int) -> None:
+        """
+        Remove the blocks that are no longer needed from `blocks` and replace 
+        the removed blocks with null_block.
+
+        Args:
+            request_id: The request ID.
+            num_computed_tokens: The number of tokens that have been computed.
+        """
+        for manager in self.single_type_managers:
+            manager.remove_skipped_blocks(request_id, num_computed_tokens)
+
+    def get_blocks(self, request_id: str) -> list[list[KVCacheBlock]]:
+        """
+        Get the blocks for the request.
+        """
+        return [
+            manager.req_to_blocks[request_id]
+            for manager in self.single_type_managers
+        ]
+
+    @abstractmethod
+    def find_longest_cache_hit(
+            self, block_hashes: list[BlockHash],
+            max_cache_hit_length: int) -> tuple[list[list[KVCacheBlock]], int]:
+        pass
+
+
+class UnitaryKVCacheCoordinator(KVCacheCoordinator):
+    """
+    KV cache coordinator for models with only one KV cache group. This is the
+    case for models with only one KV cache type, e.g., all attention layers use
+    full attention or all attention layers use sliding window attention.
+    """
+
+    def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
+                 use_eagle: bool, enable_caching: bool,
+                 caching_hash_fn: Callable, enable_kv_cache_events: bool):
+        super().__init__(kv_cache_config, max_model_len, use_eagle,
+                         enable_caching, caching_hash_fn,
+                         enable_kv_cache_events)
+        self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[
+            0].kv_cache_spec
+        self.block_size = self.kv_cache_spec.block_size
+        assert len(self.kv_cache_config.kv_cache_groups) == 1, (
+            "UnitaryKVCacheCoordinator assumes only one kv cache group")
+
+    def find_longest_cache_hit(
+            self, block_hashes: list[BlockHash],
+            max_cache_hit_length: int) -> tuple[list[list[KVCacheBlock]], int]:
+        hit_blocks = self.single_type_managers[0].find_longest_cache_hit(
+            block_hashes=block_hashes,
+            max_length=max_cache_hit_length,
+            kv_cache_group_ids=[0],
+            block_pool=self.block_pool,
+            kv_cache_spec=self.kv_cache_spec,
+            use_eagle=self.use_eagle,
+        )
+        return hit_blocks, len(hit_blocks[0]) * self.block_size
+
+
+class HybridKVCacheCoordinator(KVCacheCoordinator):
+    """
+    KV cache coordinator for hybrid models with multiple KV cache types, and
+    thus multiple kv cache groups.
+    To simplify `find_longest_cache_hit`, it only supports the combination of 
+    two types of KV cache groups, and one of them must be full attention.
+    May extend to more general cases in the future.
+    """
+
+    def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
+                 use_eagle: bool, enable_caching: bool,
+                 caching_hash_fn: Callable, enable_kv_cache_events: bool):
+        super().__init__(kv_cache_config, max_model_len, use_eagle,
+                         enable_caching, caching_hash_fn,
+                         enable_kv_cache_events)
+        self.verify_and_split_kv_cache_groups()
+
+    def verify_and_split_kv_cache_groups(self) -> None:
+        """
+        Verifies that the model has exactly two types of KV cache groups, and 
+        one of them is full attention. Then, split the kv cache groups into full
+        attention groups and other groups.
+        """
+        full_attention_type_id: Optional[str] = None
+        other_type_id: Optional[str] = None
+        self.full_attention_group_ids: list[int] = []
+        self.other_group_ids: list[int] = []
+        for i, g in enumerate(self.kv_cache_config.kv_cache_groups):
+            if isinstance(g.kv_cache_spec, FullAttentionSpec):
+                if full_attention_type_id is None:
+                    full_attention_type_id = g.kv_cache_spec.type_id
+                else:
+                    assert full_attention_type_id == g.kv_cache_spec.type_id, (
+                        "HybridKVCacheCoordinator assumes exactly one type of "
+                        "full attention groups now.")
+                self.full_attention_group_ids.append(i)
+            else:
+                if other_type_id is None:
+                    other_type_id = g.kv_cache_spec.type_id
+                else:
+                    assert other_type_id == g.kv_cache_spec.type_id, (
+                        "HybridKVCacheCoordinator assumes "
+                        "exactly one other type of groups now.")
+                self.other_group_ids.append(i)
+
+        assert full_attention_type_id is not None, (
+            "HybridKVCacheCoordinator assumes exactly one type of full "
+            "attention groups now.")
+        assert other_type_id is not None, (
+            "HybridKVCacheCoordinator assumes exactly one type of other "
+            "groups now.")
+
+        self.full_attention_manager_cls = FullAttentionManager
+        self.other_attention_cls = self.single_type_managers[
+            self.other_group_ids[0]].__class__
+
+        self.full_attention_spec = self.kv_cache_config.kv_cache_groups[
+            self.full_attention_group_ids[0]].kv_cache_spec
+        self.other_spec = self.kv_cache_config.kv_cache_groups[
+            self.other_group_ids[0]].kv_cache_spec
+
+        self.full_attention_block_size = self.full_attention_spec.block_size
+        self.other_block_size = self.other_spec.block_size
+        assert self.other_block_size % self.full_attention_block_size == 0, (
+            "KVCacheCoordinator assumes the block_size of full attention "
+            "layers is divisible by other layers now.")
+
+    def find_longest_cache_hit(
+        self,
+        block_hashes: list[BlockHash],
+        max_cache_hit_length: int,
+    ) -> tuple[list[list[KVCacheBlock]], int]:
+        """
+        Find the longest cache hit for the request.
+
+        Args:
+            block_hashes: The block hashes of the request.
+            max_cache_hit_length: The maximum length of the cache hit.
+
+        Returns:
+            A tuple containing:
+                - A list of the cache hit blocks for each single type manager.
+                - The number of tokens of the longest cache hit.
+        """
+        # First, find the longest cache hit for full attention.
+        hit_blocks_full_attn = (
+            self.full_attention_manager_cls.find_longest_cache_hit(
+                block_hashes=block_hashes,
+                max_length=max_cache_hit_length,
+                kv_cache_group_ids=self.full_attention_group_ids,
+                block_pool=self.block_pool,
+                kv_cache_spec=self.full_attention_spec,
+                use_eagle=self.use_eagle,
+            ))
+        hit_length = len(
+            hit_blocks_full_attn[0]) * self.full_attention_block_size
+
+        # Next, find the cache hit for the other attention WITHIN
+        # the cache hit of full attention.
+        hit_blocks_other_attn = (
+            self.other_attention_cls.find_longest_cache_hit(
+                block_hashes=block_hashes,
+                max_length=hit_length,
+                kv_cache_group_ids=self.other_group_ids,
+                block_pool=self.block_pool,
+                kv_cache_spec=self.other_spec,
+                use_eagle=self.use_eagle,
+            ))
+        hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
+
+        # NOTE: the prefix cache hit length must be a multiply of block_size as
+        # we don't support partial block cache hit yet. The cache hit length
+        # of other attention is ensured to be a multiply of the block size of
+        # full attention layers in current implementation, because hit_length is
+        # a multiply of other attention's block size, and other attention's
+        # block size is a multiply of full attention's block size (verified in
+        # `verify_and_split_kv_cache_groups`).
+        assert hit_length % self.full_attention_block_size == 0
+
+        # Truncate the full attention cache hit to the length of the
+        # cache hit of the other attention.
+        for i in range(len(hit_blocks_full_attn)):
+            del hit_blocks_full_attn[i][hit_length //
+                                        self.full_attention_block_size:]
+
+        # Merge the hit blocks of full attention and other attention.
+        hit_blocks = hit_blocks_other_attn
+        for group_id, blocks in enumerate(hit_blocks_full_attn):
+            # NOTE: there is only one full attention group in most cases. So
+            # the time complexity of insert is fine.
+            hit_blocks.insert(group_id, blocks)
+        return hit_blocks, hit_length
+
+
+def get_kv_cache_coordinator(
+        kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
+        enable_caching: bool, caching_hash_fn: Callable,
+        enable_kv_cache_events: bool) -> KVCacheCoordinator:
+    if len(kv_cache_config.kv_cache_groups) == 1:
+        return UnitaryKVCacheCoordinator(kv_cache_config, max_model_len,
+                                         use_eagle, enable_caching,
+                                         caching_hash_fn,
+                                         enable_kv_cache_events)
+    else:
+        return HybridKVCacheCoordinator(kv_cache_config, max_model_len,
+                                        use_eagle, enable_caching,
+                                        caching_hash_fn,
+                                        enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 91999d300..fc701215b 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -8,11 +8,9 @@ from typing import Optional
 from vllm.distributed.kv_events import KVCacheEvent
 from vllm.logger import init_logger
 from vllm.utils import sha256
-from vllm.v1.core.block_pool import BlockPool
+from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator
 from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
                                          hash_request_tokens)
-from vllm.v1.core.single_type_kv_cache_manager import (
-    get_manager_for_kv_cache_spec)
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request, RequestStatus
@@ -22,16 +20,24 @@ logger = init_logger(__name__)
 
 @dataclass
 class KVCacheBlocks:
-    blocks: list[KVCacheBlock]
+    """
+    The allocation result of KVCacheManager, work as the interface between 
+    Scheduler and KVCacheManager, to hide KVCacheManager's internal data 
+    structure from the Scheduler.
+    """
+    blocks: list[list[KVCacheBlock]]
+    """
+    blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
+    We don't use block of tokens as the outer dimension because it assumes all
+    kv_cache_groups have the same number of blocks, which is true for now but 
+    will be broken if we want to give different block_size to different 
+    kv_cache_groups in the future.
+    """
 
     def __add__(self, other: "KVCacheBlocks") -> "KVCacheBlocks":
         """Adds two KVCacheBlocks instances."""
-        return KVCacheBlocks(self.blocks + other.blocks)
-
-    @classmethod
-    def create_empty(cls) -> "KVCacheBlocks":
-        """Creates a new KVCacheBlocks instance with no blocks."""
-        return cls([])
+        return KVCacheBlocks(
+            [blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks)])
 
     def get_block_ids(self) -> list[list[int]]:
         """
@@ -39,15 +45,20 @@ class KVCacheBlocks:
         
         Returns:
             list[list[int]]: A two-level list where
-            * the outer list corresponds to KV cache groups (only 1 group now)
+            * the outer list corresponds to KV cache groups
             * each inner list contains the block_ids of the blocks in that group
         """
-        return [[block.block_id for block in self.blocks]]
+        block_ids = []
+        for group in self.blocks:
+            block_ids.append([blk.block_id for blk in group])
+        return block_ids
 
     def get_unhashed_block_ids(self) -> list[int]:
         """Get block_ids of unhashed blocks from KVCacheBlocks instance."""
+        assert len(self.blocks) == 1, "Only one group is supported"
         return [
-            block.block_id for block in self.blocks if block.block_hash is None
+            block.block_id for block in self.blocks[0]
+            if block.block_hash is None
         ]
 
 
@@ -63,12 +74,6 @@ class KVCacheManager:
         log_stats: bool = False,
         enable_kv_cache_events: bool = False,
     ) -> None:
-        assert len(kv_cache_config.kv_cache_groups) == 1, (
-            "KVCacheManager does not support hybrid models with more than 1 "
-            "kv cache group")
-        kv_cache_spec = kv_cache_config.kv_cache_groups[0].kv_cache_spec
-        self.block_size = kv_cache_spec.block_size
-        self.num_gpu_blocks = kv_cache_config.num_blocks
         self.max_model_len = max_model_len
 
         self.enable_caching = enable_caching
@@ -77,17 +82,24 @@ class KVCacheManager:
         self.log_stats = log_stats
         # FIXME: make prefix cache stats conditional on log_stats
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
-
-        self.block_pool = BlockPool(self.num_gpu_blocks, enable_caching,
-                                    enable_kv_cache_events)
-
-        self.single_type_manager = get_manager_for_kv_cache_spec(
-            kv_cache_spec=kv_cache_spec,
-            block_pool=self.block_pool,
+        assert len(
+            set(g.kv_cache_spec.block_size
+                for g in kv_cache_config.kv_cache_groups)
+        ) == 1, "Only one block size is supported for now"
+        self.block_size = kv_cache_config.kv_cache_groups[
+            0].kv_cache_spec.block_size
+
+        self.coordinator = get_kv_cache_coordinator(
+            kv_cache_config=kv_cache_config,
+            max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            num_kv_cache_groups=1,
+            enable_caching=enable_caching,
             caching_hash_fn=self.caching_hash_fn,
+            enable_kv_cache_events=enable_kv_cache_events,
         )
+        self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
+        self.block_pool = self.coordinator.block_pool
+        self.kv_cache_config = kv_cache_config
 
         # Mapping from request ID to kv block hashes.
         # This is to avoid recomputing the block hashes for each call of
@@ -133,7 +145,7 @@ class KVCacheManager:
         # When the request requires prompt logprobs, we skip prefix caching.
         if (not self.enable_caching
                 or request.sampling_params.prompt_logprobs is not None):
-            return KVCacheBlocks.create_empty(), 0
+            return self.create_empty_block_list(), 0
 
         # The block hashes for the request may already be computed
         # if the scheduler has tried to schedule the request before.
@@ -154,20 +166,16 @@ class KVCacheManager:
         # num_computed_tokens to be block-size aligned. Removing this limitation
         # could slightly improve performance in the future.
         max_cache_hit_length = request.num_tokens - 1
-
-        computed_blocks = self.single_type_manager.find_longest_cache_hit(
-            block_hashes, max_cache_hit_length)
-        # NOTE(woosuk): Since incomplete blocks are not eligible for
-        # sharing, `num_computed_tokens` is always a multiple of
-        # `block_size`.
-        num_computed_tokens = len(computed_blocks) * self.block_size
+        computed_blocks, num_new_computed_tokens = (
+            self.coordinator.find_longest_cache_hit(block_hashes,
+                                                    max_cache_hit_length))
 
         if self.log_stats:
             assert self.prefix_cache_stats is not None
             self.prefix_cache_stats.queries += request.num_tokens
-            self.prefix_cache_stats.hits += num_computed_tokens
+            self.prefix_cache_stats.hits += num_new_computed_tokens
 
-        return KVCacheBlocks(computed_blocks), num_computed_tokens
+        return KVCacheBlocks(computed_blocks), num_new_computed_tokens
 
     def allocate_slots(
         self,
@@ -220,7 +228,9 @@ class KVCacheManager:
         if new_computed_blocks is not None:
             new_computed_block_list = new_computed_blocks.blocks
         else:
-            new_computed_block_list = []
+            new_computed_block_list = [
+                [] for _ in range(len(self.kv_cache_config.kv_cache_groups))
+            ]
 
         # Free the blocks that are skipped during the attention computation
         # (e.g., tokens outside the sliding window).
@@ -228,8 +238,8 @@ class KVCacheManager:
         # insufficient free blocks.
         # Should call this function before allocating new blocks to reduce
         # the number of evicted blocks.
-        self.single_type_manager.remove_skipped_blocks(
-            request.request_id, request.num_computed_tokens)
+        self.coordinator.remove_skipped_blocks(request.request_id,
+                                               request.num_computed_tokens)
 
         # The number of computed tokens is the number of computed tokens plus
         # the new prefix caching hits
@@ -238,12 +248,12 @@ class KVCacheManager:
         num_tokens_need_slot = min(
             num_computed_tokens + num_new_tokens + num_lookahead_tokens,
             self.max_model_len)
-        num_blocks_to_allocate = (
-            self.single_type_manager.get_num_blocks_to_allocate(
-                request_id=request.request_id,
-                num_tokens=num_tokens_need_slot,
-                new_computed_blocks=new_computed_block_list,
-            ))
+
+        num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
+            request_id=request.request_id,
+            num_tokens=num_tokens_need_slot,
+            new_computed_blocks=new_computed_block_list,
+        )
 
         if num_blocks_to_allocate > self.block_pool.get_num_free_blocks():
             # Cannot allocate new blocks
@@ -253,16 +263,16 @@ class KVCacheManager:
         if self.enable_caching:
             self.block_pool.touch(new_computed_block_list)
         else:
-            assert not new_computed_block_list, (
+            assert all(not blocks for blocks in new_computed_block_list), (
                 "Computed blocks should be empty when "
                 "prefix caching is disabled")
 
         # Append the new computed blocks to the request blocks until now to
         # avoid the case where the new blocks cannot be allocated.
-        self.single_type_manager.save_new_computed_blocks(
-            request.request_id, new_computed_block_list)
+        self.coordinator.save_new_computed_blocks(request.request_id,
+                                                  new_computed_block_list)
 
-        new_blocks = self.single_type_manager.allocate_new_blocks(
+        new_blocks = self.coordinator.allocate_new_blocks(
             request.request_id, num_tokens_need_slot)
 
         # P/D: delay caching blocks if we have to recv from
@@ -273,7 +283,7 @@ class KVCacheManager:
         # Speculated tokens might be rejected in the future, so we does
         # not cache any speculated tokens. We only cache blocks with
         # generated (accepted) tokens.
-        self.single_type_manager.cache_blocks(
+        self.coordinator.cache_blocks(
             request, self.req_to_block_hashes[request.request_id],
             num_computed_tokens + num_new_tokens - num_draft_tokens)
 
@@ -287,7 +297,7 @@ class KVCacheManager:
         Args:
             request: The request to free the blocks.
         """
-        self.single_type_manager.free(request.request_id)
+        self.coordinator.free(request.request_id)
 
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
@@ -345,10 +355,8 @@ class KVCacheManager:
             group.
         """
         assert request.status == RequestStatus.RUNNING
-        return [
-            self.single_type_manager.get_num_common_prefix_blocks(
-                request.request_id, num_running_requests)
-        ]
+        return self.coordinator.get_num_common_prefix_blocks(
+            request.request_id, num_running_requests)
 
     def free_block_hashes(self, request: Request) -> None:
         """Discard the block hashes for the request.
@@ -368,6 +376,15 @@ class KVCacheManager:
 
     def get_block_ids(self, request_id: str) -> list[list[int]]:
         """Get the block ids of a request."""
-        assert request_id in self.single_type_manager.req_to_blocks
-        return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id]
-                             ).get_block_ids()
+        return KVCacheBlocks(
+            self.coordinator.get_blocks(request_id)).get_block_ids()
+
+    def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
+                     num_computed_tokens: int) -> None:
+        """Cache the blocks for the request."""
+        self.coordinator.cache_blocks(request, block_hashes,
+                                      num_computed_tokens)
+
+    def create_empty_block_list(self) -> KVCacheBlocks:
+        """Creates a new KVCacheBlocks instance with no blocks."""
+        return KVCacheBlocks([[] for _ in range(self.num_kv_cache_groups)])
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index ad3c21f79..6d4bcfe64 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """KV-Cache Utilities."""
+
 import os
-from collections import deque
+from collections import defaultdict, deque
 from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from typing import Any, Callable, NamedTuple, Optional
@@ -33,6 +34,18 @@ class BlockHash(NamedTuple):
     extra_keys: Optional[Any] = None
 
 
+class BlockHashWithGroupId(NamedTuple):
+    # The hash value for the contents (e.g., token_ids) of a block without group
+    # ID. The value is the same for blocks representing the same tokens but for
+    # different groups.
+    block_hash: BlockHash
+    # The KV cache group ID.
+    group_id: int
+
+    def get_hash_value(self) -> int:
+        return self.block_hash.hash_value
+
+
 # The hash seed for the first block of the prefix block sequence.
 #
 # Even if the hash function is the builtin hash(), we use sha256 to generate
@@ -44,7 +57,7 @@ class BlockHash(NamedTuple):
 # This aligns with the behavior of Python's hash() function, which also uses
 # a random seed if PYTHONHASHSEED is not set.
 NONE_HASH = int.from_bytes(os.urandom(32), byteorder="big") if os.getenv(
-    'PYTHONHASHSEED') is None else sha256(os.getenv('PYTHONHASHSEED'))
+    "PYTHONHASHSEED") is None else sha256(os.getenv("PYTHONHASHSEED"))
 
 
 class PrefixCachingMetrics:
@@ -118,7 +131,7 @@ class KVCacheBlock:
     ref_cnt: int = 0
     # The hash of the block composed of (block hash, tuple of token IDs).
     # It is only available when the block is full.
-    _block_hash: Optional[BlockHash] = None
+    _block_hash: Optional[BlockHashWithGroupId] = None
 
     # Used to construct a doubly linked list for free blocks.
     # These two attributes should only be manipulated by FreeKVCacheBlockQueue.
@@ -135,11 +148,11 @@ class KVCacheBlock:
         self.ref_cnt -= 1
 
     @property
-    def block_hash(self) -> Optional[BlockHash]:
+    def block_hash(self) -> Optional[BlockHashWithGroupId]:
         return self._block_hash
 
     @block_hash.setter
-    def block_hash(self, block_hash: BlockHash):
+    def block_hash(self, block_hash: BlockHashWithGroupId):
         assert self.block_hash is None, (
             "The block already has a hash. This should not happen.")
         self._block_hash = block_hash
@@ -151,10 +164,10 @@ class KVCacheBlock:
     def __repr__(self) -> str:
         # Use block_id instead of KVCacheBlock object to avoid calling __repr__
         # on KVCacheBlock object recursively.
-        prev_block_id = self.prev_free_block.block_id \
-            if self.prev_free_block else None
-        next_block_id = self.next_free_block.block_id \
-            if self.next_free_block else None
+        prev_block_id = (self.prev_free_block.block_id
+                         if self.prev_free_block else None)
+        next_block_id = (self.next_free_block.block_id
+                         if self.next_free_block else None)
         return (f"KVCacheBlock(block_id={self.block_id}, "
                 f"ref_cnt={self.ref_cnt}, "
                 f"_block_hash={self._block_hash}, "
@@ -570,20 +583,20 @@ def create_kv_cache_group_specs(
         kv_cache_spec: dict[str, KVCacheSpec],
         grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]:
     """
-     Create KVCacheGroupSpec object for each kv cache group layer.
-     The layers in the same group should share the same
-     KVCacheSpec.
-
-     Args:
-         kv_cache_spec:
-             A mapping from each layer name to its corresponding KVCacheSpec.
-         grouped_layer_names:
-             A list of kv cache groups, where each element is a list of layer
-             names that belong to the same group and should share the same
-             KVCacheSpec.
-     Returns:
-         A list of KVCacheGroupSpec objects, one for each group.
-     """
+    Create KVCacheGroupSpec object for each kv cache group layer.
+    The layers in the same group should share the same
+    KVCacheSpec.
+
+    Args:
+        kv_cache_spec:
+            A mapping from each layer name to its corresponding KVCacheSpec.
+        grouped_layer_names:
+            A list of kv cache groups, where each element is a list of layer
+            names that belong to the same group and should share the same
+            KVCacheSpec.
+    Returns:
+        A list of KVCacheGroupSpec objects, one for each group.
+    """
     kv_cache_groups = []
     for layer_names_one_group in grouped_layer_names:
         layer_specs = [
@@ -628,6 +641,37 @@ def get_max_concurrency_for_kv_cache_config(
     return max_concurrency
 
 
+def get_num_blocks(vllm_config: VllmConfig, num_layers: int,
+                   available_memory: int, page_size: int) -> int:
+    """
+    Get the number of kv cache blocks.
+
+    Args:
+        vllm_config: The global VllmConfig
+        num_layers: The number of layers
+        available_memory: Memory available for KV cache in bytes.
+        page_size: The page size of the KV cache.
+    """
+    num_blocks = int(available_memory // page_size // num_layers)
+    num_blocks = max(num_blocks, 0)
+    if vllm_config.cache_config.num_gpu_blocks_override is not None:
+        num_gpu_blocks_override = \
+            vllm_config.cache_config.num_gpu_blocks_override
+        logger.info(
+            "Overriding num_gpu_blocks=%d with "
+            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
+    return num_blocks
+
+
+def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
+    """
+    Get the page size of the KV cache.
+    """
+    page_sizes = set(layer.page_size_bytes for layer in kv_cache_spec.values())
+    assert len(page_sizes) == 1
+    return page_sizes.pop()
+
+
 def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
                                       kv_cache_spec: dict[str, KVCacheSpec],
                                       available_memory: int) -> KVCacheConfig:
@@ -644,32 +688,24 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
         The generated KVCacheConfig
     """
 
-    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
-    assert len(page_sizes) == 1
-    page_size = page_sizes.pop()
-
-    num_blocks = int(available_memory // page_size // len(kv_cache_spec))
-    num_blocks = max(num_blocks, 0)
-
-    if vllm_config.cache_config.num_gpu_blocks_override is not None:
-        num_gpu_blocks_override = \
-            vllm_config.cache_config.num_gpu_blocks_override
-        logger.info(
-            "Overriding num_gpu_blocks=%d with "
-            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
-        num_blocks = num_gpu_blocks_override
+    page_size = get_uniform_page_size(kv_cache_spec)
+    num_blocks = get_num_blocks(vllm_config, len(kv_cache_spec),
+                                available_memory, page_size)
 
     per_layer_size = page_size * num_blocks
     # All layers have the same KV cache spec, so we create one kv cache group
     # for all layers.
     grouped_layer_names = [list(kv_cache_spec.keys())]
 
+    # Each layer uses a separate Tensor to store its KV cache.
+    kv_cache_tensors = [
+        KVCacheTensor(size=per_layer_size, shared_by=[layer_name])
+        for layer_name in kv_cache_spec
+    ]
+
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,
-        tensors={
-            layer_name: KVCacheTensor(size=per_layer_size)
-            for layer_name in kv_cache_spec
-        },
+        kv_cache_tensors=kv_cache_tensors,
         kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
                                                     grouped_layer_names),
     )
@@ -685,17 +721,185 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
     return kv_cache_config
 
 
+def is_kv_cache_page_size_uniform(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+    """
+    Whether all layers in the given KVCacheSpec have the same page size.
+    Args:
+        kv_cache_spec: The KVCacheSpec of each attention layer in the model
+
+    Returns:
+        True if all layers have the same page size, False otherwise.
+    """
+
+    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
+    return len(page_sizes) == 1
+
+
+def _get_kv_cache_config_uniform_page_size(
+        vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
+        available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for hybrid models with multiple 
+    attention types but still with a uniform page size (physical memory per 
+    block per layer) for all layers.
+
+    Detailed explanation about kv cache management of hybrid models:
+    The layers in the models are repeated with some patterns, e.g., a model
+    with 10 full attention layers and 20 sliding window attention layers can be
+    regarded as repeating the pattern (1 * full, 2 * sw) 10 times. 
+    The KVCacheManager allocates different block tables for each of the 3 layers
+    in the pattern, and repeats each of them 10 times to generate the 
+    block_table for the 30 layers in the model.
+    Therefore, we can group the layers in the model into 3 kv_cache_groups, each
+    of which contains 10 layers in the model.
+    The KVCacheManager allocates the block_table for each group based on its
+    kv_cache spec, and the model runner applies the block table to each layer 
+    in the group.
+    For example:
+    1. A model only uses full attention. The pattern is 
+    (num_hidden_layers * full), so there is only one group and the block table 
+    is shared by all layers. It is already handled by 
+    `_get_kv_cache_config_uniform_type`.
+    2. A model with 10 full attention layers and 20 sliding window 
+    attention layers. There are 3 layers in the pattern (1 * full, 2 * sw), so 
+    there are 3 kv_cache_groups, each of which represents 10 layers.
+
+    To simplify the implementation, we make the following assumptions:
+    1. Physical memory per block: Must be the same across all KV cache groups. 
+    Breaking this assumption is non-trivial due to memory fragmentation concerns
+    when allocating blocks of different sizes.
+    2. Tokens per block (block_size): Currently, we directly use 
+    `CacheConfig.block_size` for all layers. It can be extended to vary by KV 
+    cache group, but within each KV cache group, all layers must share the same 
+    block size.
+    3. Physical memory per token per layer: This property is decided by model 
+    config. Currently we only support models that have the same physical memory 
+    per token per layer for all layers. Can be relaxed with a simple extension, 
+    but still need to keep physical memory per block the same for all groups.
+    4. Number of layers per group: Currently assumed the same for all layers. 
+    Can be relaxed with a simple extension, but still need to keep physical 
+    memory per block the same for all groups.
+    5. Attention type within groups: All layers in a group must share the same
+    attention type. One exception is that, when 
+    `--disable-hybrid-kv-cache-manager` is true, the single group for full 
+    attention layers may also include attention layers using sliding window or 
+    LLaMA 4 local attention. See `unify_hybrid_kv_cache_specs` for more details.
+    6. Support for multiple attention types: The design for most components is 
+    general to an arbitrary number of attention types. But 
+    `find_longest_cache_hit` only supports one attention type or two 
+    types of full-attention plus exactly one another type. The general
+    implementation of this function is feasible but we don't know how to 
+    implement it cleanly yet.
+
+    As we assume tokens per block, physical memory per token per layer, and 
+    number of layers per group are the same now, we can ensure that physical 
+    memory per block is the same for all groups.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The KVCacheSpec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes.
+    Returns:
+        The generated KVCacheConfig
+    """
+    # Group all layers by type_id.
+    # E.g., 2 full attention layers and 3 sliding window attention layers,
+    # -> (full.0, full.1), (sw.0, sw.1, sw.2).
+    same_type_layers: dict[str, list[str]] = defaultdict(list)
+    for layer_name, layer_spec in kv_cache_spec.items():
+        same_type_layers[layer_spec.type_id].append(layer_name)
+
+    # Split each group into smaller groups, to make the number of layers in each
+    # group identical. Add padding to the last group of each type if necessary.
+    # E.g., (full.0, full.1), (sw.0, sw.1, sw.2)
+    # split to 3 groups with 2 layers each:
+    # (full.0, full.1), (sw.0, sw.1), (sw.2, padding).
+    # FIXME(Chen): At the moment of writing this code (2025-06-02), all
+    # open-source hybrid model follows a n:1 pattern between different attention
+    # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and
+    # full), so we can use the "1" in the n:1 pattern as the group size, which
+    # is the minimum number of layers among all attention types. Need a better
+    # strategy if we want to support more complex patterns (e.g., 20 full + 30
+    # sw, where the group size should be 10).
+    group_size = min([len(layers) for layers in same_type_layers.values()])
+    grouped_layers = []
+    for layers in same_type_layers.values():
+        num_padding_layers = group_size - len(layers) % group_size
+        if num_padding_layers != group_size:
+            logger.warning(
+                "Add %d padding layers, may waste at most %.2f%% KV cache memory",  # noqa
+                num_padding_layers,
+                num_padding_layers / len(layers) * 100,
+            )
+        for i in range(0, len(layers), group_size):
+            grouped_layers.append(layers[i:i + group_size])
+    kv_cache_groups = create_kv_cache_group_specs(kv_cache_spec,
+                                                  grouped_layers)
+
+    # Determine how model runners should initialize the KV cache tensors.
+    # We will have group_size memory pools, each is shared by one layer from
+    # each group. As layers of different groups have different block table,
+    # they will use different parts of the shared Tensor.
+    # The memory layout in the example will be:
+    # full.0, sw.0, sw.2: share a Tensor with size=available_memory//2
+    # full.1, sw.1: share another Tensor with size=available_memory//2
+    page_size = get_uniform_page_size(kv_cache_spec)
+    num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
+                                page_size)
+    per_memory_pool_size = page_size * num_blocks
+    kv_cache_tensors = []
+    for i in range(group_size):
+        shared_by = []
+        for j in range(len(kv_cache_groups)):
+            if i < len(grouped_layers[j]):
+                shared_by.append(grouped_layers[j][i])
+        kv_cache_tensors.append(
+            KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by))
+
+    kv_cache_config = KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=kv_cache_tensors,
+        kv_cache_groups=kv_cache_groups,
+    )
+
+    # Print the KV cache size and maximum concurrency.
+    num_tokens = num_blocks // len(
+        grouped_layers) * vllm_config.cache_config.block_size
+    num_tokens_str = f"{num_tokens:,}"
+    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
+    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_concurrency = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config)
+    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                max_model_len_str, max_concurrency)
+    return kv_cache_config
+
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
-    Only models with one type of KV cache are supported yet. This function tries
-    to convert the KV cache specs to one type if the model is a hybrid model
-    with multiple type of KV cache. It will convert all SlidingWindowSpec to
-    FullAttentionSpec if both types are present.
+    This function tries to convert the KV cache specs to one type if the model
+    is a hybrid model with multiple type of KV cache. It will convert all
+    SlidingWindowSpec to FullAttentionSpec if both types are present.
 
     Args:
         kv_cache_spec: The kv cache spec of each attention layer in the model
     """
 
+    def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+        type_ids = set(layer_spec.type_id
+                       for layer_spec in kv_cache_spec.values())
+        return len(type_ids) > 1
+
+    if not is_hybrid(kv_cache_spec):
+        return
+
+    logger.warning(
+        "Hybrid KV cache manager is disabled for this hybrid model, "
+        "This means we do not enable any optimizations for saving KV cache "
+        "memory (e.g., dropping the KV cache outside the sliding window). "
+        "The compute of layers like sliding window is still saved.")
+
     has_full_attention = any(
         isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values())
     has_sliding_window = any(
@@ -712,13 +916,18 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
                     sliding_window=spec.sliding_window,
                 )
 
+    if is_hybrid(kv_cache_spec):
+        raise ValueError("Hybrid KV cache manager is disabled but failed to "
+                         "convert the KV cache specs to one unified type.")
+
 
-def get_kv_cache_config(vllm_config: VllmConfig,
-                        kv_cache_spec: dict[str, KVCacheSpec],
-                        available_memory: int) -> KVCacheConfig:
+def get_kv_cache_config(
+    vllm_config: VllmConfig,
+    kv_cache_spec: dict[str, KVCacheSpec],
+    available_memory: int,
+) -> KVCacheConfig:
     """
-    Generates the KV cache configuration for a model
-    TODO: support hybrid models with more than one type of KV cache.
+    Generates the KV cache configuration for a model.
 
     Args:
         vllm_config: The global VllmConfig
@@ -728,14 +937,25 @@ def get_kv_cache_config(vllm_config: VllmConfig,
     Returns:
         The generated KVCacheConfigs
     """
-    unify_hybrid_kv_cache_specs(kv_cache_spec)
     check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
+
+    if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
+        unify_hybrid_kv_cache_specs(kv_cache_spec)
+
     if is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
         return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
                                                  available_memory)
+    elif is_kv_cache_page_size_uniform(kv_cache_spec):
+        # Model contains multiple attention types, but KV cache of all layers
+        # have the same physical memory per block per layer. Split the layers
+        # into groups with the same number of layers, and thus same total page
+        # size.
+        return _get_kv_cache_config_uniform_page_size(vllm_config,
+                                                      kv_cache_spec,
+                                                      available_memory)
 
     raise NotImplementedError
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 6a4c6bcf4..7347f8e46 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -18,7 +18,7 @@ from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
-from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
+from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
@@ -377,7 +377,8 @@ class Scheduler(SchedulerInterface):
                 # KVTransfer: WAITING reqs have num_computed_tokens > 0
                 # after async KV recvs are completed.
                 else:
-                    new_computed_blocks = KVCacheBlocks.create_empty()
+                    new_computed_blocks = (
+                        self.kv_cache_manager.create_empty_block_list())
                     num_new_local_computed_tokens = 0
                     num_computed_tokens = request.num_computed_tokens
 
@@ -1010,7 +1011,7 @@ class Scheduler(SchedulerInterface):
         num_computed_tokens = len(block_ids) * self.block_size
         if num_computed_tokens == request.num_tokens:
             num_computed_tokens -= 1
-        self.kv_cache_manager.single_type_manager.cache_blocks(
+        self.kv_cache_manager.cache_blocks(
             request,
             self.kv_cache_manager.req_to_block_hashes[request.request_id],
             num_computed_tokens,
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index a529cde09..98d758f82 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -22,8 +22,7 @@ class SingleTypeKVCacheManager(ABC):
         self,
         kv_cache_spec: KVCacheSpec,
         block_pool: BlockPool,
-        use_eagle: bool,
-        num_kv_cache_groups: int,
+        kv_cache_group_id: int,
         caching_hash_fn: Callable,
     ) -> None:
         """
@@ -31,9 +30,7 @@ class SingleTypeKVCacheManager(ABC):
         Args:
             kv_cache_spec: The kv_cache_spec for this manager.
             block_pool: The block pool.
-            use_eagle: Whether to use eagle.
-            num_kv_cache_groups: The number of kv cache groups managed by this 
-                manager.
+            kv_cache_group_id: The id of the kv cache group of this manager.
             caching_hash_fn: The caching hash function.
         """
 
@@ -41,9 +38,6 @@ class SingleTypeKVCacheManager(ABC):
         self.kv_cache_spec = kv_cache_spec
         self.block_pool = block_pool
 
-        # Needs special handling for find_longest_cache_hit if eagle is enabled
-        self.use_eagle = use_eagle
-
         # Mapping from request ID to blocks to track the blocks allocated
         # for each request, so that we can free the blocks when the request
         # is finished.
@@ -56,8 +50,8 @@ class SingleTypeKVCacheManager(ABC):
         # data for reempted ones.
         self.num_cached_block: dict[str, int] = {}
 
-        self.num_kv_cache_groups = num_kv_cache_groups
         self.caching_hash_fn = caching_hash_fn
+        self.kv_cache_group_id = kv_cache_group_id
 
     def get_num_blocks_to_allocate(
             self, request_id: str, num_tokens: int,
@@ -86,8 +80,7 @@ class SingleTypeKVCacheManager(ABC):
         num_evictable_computed_blocks = sum(
             blk.ref_cnt == 0 and not blk.is_null
             for blk in new_computed_blocks)
-        return ((num_new_blocks + num_evictable_computed_blocks) *
-                self.num_kv_cache_groups)
+        return num_new_blocks + num_evictable_computed_blocks
 
     def save_new_computed_blocks(
             self, request_id: str,
@@ -130,8 +123,7 @@ class SingleTypeKVCacheManager(ABC):
         if num_new_blocks <= 0:
             return []
         else:
-            new_blocks = self.block_pool.get_new_blocks(
-                num_new_blocks * self.num_kv_cache_groups)
+            new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
             req_blocks.extend(new_blocks)
             return new_blocks
 
@@ -156,12 +148,19 @@ class SingleTypeKVCacheManager(ABC):
             num_cached_blocks=num_cached_blocks,
             num_full_blocks=num_full_blocks,
             block_size=self.block_size,
+            kv_cache_group_id=self.kv_cache_group_id,
             hash_fn=self.caching_hash_fn,
         )
 
         self.num_cached_block[request.request_id] = num_full_blocks
 
     def free(self, request_id: str) -> None:
+        """
+        Free the blocks for the request.
+
+        Args:
+            request_id: The request ID.
+        """
         # Default to [] in case a request is freed (aborted) before alloc.
         req_blocks = self.req_to_blocks.pop(request_id, [])
 
@@ -188,12 +187,22 @@ class SingleTypeKVCacheManager(ABC):
 
         raise NotImplementedError
 
+    @classmethod
     @abstractmethod
-    def find_longest_cache_hit(self, block_hashes: list[BlockHash],
-                               max_length: int) -> list[KVCacheBlock]:
+    def find_longest_cache_hit(
+        cls,
+        block_hashes: list[BlockHash],
+        max_length: int,
+        kv_cache_group_ids: list[int],
+        block_pool: BlockPool,
+        kv_cache_spec: KVCacheSpec,
+        use_eagle: bool,
+    ) -> list[list[KVCacheBlock]]:
         """
         Get the longest cache hit prefix of the blocks that is not longer than 
-        `max_length`. If no cache hit is found, return an empty list. 
+        `max_length`. The prefix should be a common prefix hit for all the 
+        kv cache groups in `kv_cache_group_ids`. If no cache hit is found, 
+        return an empty list. 
         If eagle is enabled, drop the last matched block to force recompute the 
         last block to get the required hidden states for eagle drafting head. 
         Need to be customized for each attention type.
@@ -201,12 +210,20 @@ class SingleTypeKVCacheManager(ABC):
         Args:
             block_hashes: The block hashes of the request.
             max_length: The maximum length of the cache hit prefix.
+            kv_cache_group_ids: The ids of the kv cache groups.
+            block_pool: The block pool.
+            kv_cache_spec: The kv cache spec.
+            use_eagle: Whether to use eagle.
 
         Returns:
-            A list of cached blocks with skipped blocks replaced by null block.
+            A list of cached blocks with skipped blocks replaced by null block
+            for each kv cache group in `kv_cache_group_ids`.
+            Return a list of length `len(kv_cache_group_ids)`, where the i-th
+            element is a list of cached blocks for the i-th kv cache group
+            in `kv_cache_group_ids`.
             For example, sliding window manager should return a list like
-            [NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)] for block size 4 and 
-            sliding window 8. 
+            [[NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)]] for block size 4 
+            and sliding window 8 and len(kv_cache_group_ids) = 1.
         """
 
         raise NotImplementedError
@@ -215,11 +232,9 @@ class SingleTypeKVCacheManager(ABC):
     def remove_skipped_blocks(self, request_id: str,
                               num_computed_tokens: int) -> None:
         """
-        Remove the blocks that are no longer needed from `blocks`. The removed 
-        blocks should be replaced by null_block. Return the removed blocks in 
-        eviction order, where the first returned block should be evicted first.
-        Don't free the removed blocks in this function. Need to be customized 
-        for each attention type.
+        Remove the blocks that are no longer needed from `blocks` and free the 
+        blocks. The removed blocks should be replaced by null_block.
+        Need to be customized for each attention type.
 
         Args:
             request_id: The request ID.
@@ -230,21 +245,36 @@ class SingleTypeKVCacheManager(ABC):
 
 class FullAttentionManager(SingleTypeKVCacheManager):
 
-    def find_longest_cache_hit(self, block_hashes: list[BlockHash],
-                               max_length: int) -> list[KVCacheBlock]:
-        computed_blocks: list[KVCacheBlock] = []
-        max_num_blocks = max_length // self.block_size
+    @classmethod
+    def find_longest_cache_hit(
+        cls,
+        block_hashes: list[BlockHash],
+        max_length: int,
+        kv_cache_group_ids: list[int],
+        block_pool: BlockPool,
+        kv_cache_spec: KVCacheSpec,
+        use_eagle: bool,
+    ) -> list[list[KVCacheBlock]]:
+        assert isinstance(kv_cache_spec, FullAttentionSpec), (
+            "FullAttentionManager can only be used for full attention groups")
+        computed_blocks: list[list[KVCacheBlock]] = [
+            [] for _ in range(len(kv_cache_group_ids))
+        ]
+        max_num_blocks = max_length // kv_cache_spec.block_size
         for i in range(max_num_blocks):
             block_hash = block_hashes[i]
             # block_hashes is a chain of block hashes. If a block hash is not
             # in the cached_block_hash_to_id, the following block hashes are
             # not computed yet for sure.
-            if cached_block := self.block_pool.get_cached_block(block_hash):
-                computed_blocks.append(cached_block)
+            if cached_block := block_pool.get_cached_block(
+                    block_hash, kv_cache_group_ids):
+                for j in range(len(kv_cache_group_ids)):
+                    computed_blocks[j].append(cached_block[j])
             else:
                 break
-        if self.use_eagle and len(computed_blocks) > 0:
-            computed_blocks.pop()
+        if use_eagle and len(computed_blocks[0]) > 0:
+            for j in range(len(kv_cache_group_ids)):
+                computed_blocks[j].pop()
         return computed_blocks
 
     def remove_skipped_blocks(self, request_id: str,
@@ -267,45 +297,58 @@ class FullAttentionManager(SingleTypeKVCacheManager):
 class SlidingWindowManager(SingleTypeKVCacheManager):
 
     def __init__(self, kv_cache_spec: SlidingWindowSpec, block_pool: BlockPool,
-                 use_eagle: bool, **kwargs) -> None:
-        super().__init__(kv_cache_spec, block_pool, use_eagle, **kwargs)
+                 **kwargs) -> None:
+        super().__init__(kv_cache_spec, block_pool, **kwargs)
         self.sliding_window = kv_cache_spec.sliding_window
+        self._null_block = block_pool.null_block
+
+    @classmethod
+    def find_longest_cache_hit(
+        cls,
+        block_hashes: list[BlockHash],
+        max_length: int,
+        kv_cache_group_ids: list[int],
+        block_pool: BlockPool,
+        kv_cache_spec: KVCacheSpec,
+        use_eagle: bool,
+    ) -> list[list[KVCacheBlock]]:
+        assert isinstance(kv_cache_spec, SlidingWindowSpec), (
+            "SlidingWindowManager can only be used for sliding window groups")
+
         # The number of contiguous blocks needed for prefix cache hit.
         # -1 since the input token itself is also included in the window
-        self.sliding_window_contiguous_blocks = cdiv(
-            (kv_cache_spec.sliding_window - 1), self.block_size)
-        if self.use_eagle:
+        sliding_window_contiguous_blocks = cdiv(
+            kv_cache_spec.sliding_window - 1, kv_cache_spec.block_size)
+        if use_eagle:
             # Need to drop the last matched block if eagle is enabled. For
             # sliding window layer, we achieve this by increasing the number of
             # contiguous blocks needed for prefix cache hit by one and dropping
             # the last matched block.
-            self.sliding_window_contiguous_blocks += 1
-        self._null_block = block_pool.null_block
+            sliding_window_contiguous_blocks += 1
 
-    def find_longest_cache_hit(self, block_hashes: list[BlockHash],
-                               max_length: int) -> list[KVCacheBlock]:
         # TODO: reduce i by sliding_window_contiguous_blocks when cache miss, to
         # optimize the time complexity from O(max_num_blocks) to
         # O(max_num_blocks / sliding_window_contiguous_blocks +
         # sliding_window_contiguous_blocks),
         # which is good for low cache hit rate scenarios.
-        max_num_blocks = max_length // self.block_size
-        computed_blocks = [self._null_block] * max_num_blocks
+        max_num_blocks = max_length // kv_cache_spec.block_size
+        computed_blocks = [[block_pool.null_block] * max_num_blocks
+                           for _ in range(len(kv_cache_group_ids))]
         num_contiguous_blocks = 0
-
         match_found = False
         # Search from right to left and early stop when a match is found.
         for i in range(max_num_blocks - 1, -1, -1):
-            if cached_block := self.block_pool.get_cached_block(
-                    block_hashes[i]):
-                computed_blocks[i] = cached_block
+            if cached_block := block_pool.get_cached_block(
+                    block_hashes[i], kv_cache_group_ids):
+                for j in range(len(kv_cache_group_ids)):
+                    computed_blocks[j][i] = cached_block[j]
                 num_contiguous_blocks += 1
-                if (num_contiguous_blocks
-                        >= self.sliding_window_contiguous_blocks):
+                if (num_contiguous_blocks >= sliding_window_contiguous_blocks):
                     # Trim the trailing blocks.
                     # E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3]
                     # when sliding_window_contiguous_blocks=2.
-                    del computed_blocks[i + num_contiguous_blocks:]
+                    for j in range(len(kv_cache_group_ids)):
+                        del computed_blocks[j][i + num_contiguous_blocks:]
                     match_found = True
                     break
             else:
@@ -313,9 +356,11 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         if not match_found:
             # The first `num_contiguous_blocks` is a cache hit even if
             # `num_contiguous_blocks < sliding_window_contiguous_blocks`.
-            del computed_blocks[num_contiguous_blocks:]
-        if self.use_eagle and len(computed_blocks) > 0:
-            computed_blocks.pop()
+            for j in range(len(kv_cache_group_ids)):
+                del computed_blocks[j][num_contiguous_blocks:]
+        if use_eagle and len(computed_blocks[0]) > 0:
+            for j in range(len(kv_cache_group_ids)):
+                computed_blocks[j].pop()
         return computed_blocks
 
     def remove_skipped_blocks(self, request_id: str,
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index cf2eb3b95..e938f3bfc 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -157,11 +157,10 @@ class SlidingWindowSpec(AttentionSpec):
 @dataclass
 class KVCacheTensor:
     """
-    A dataclass for specifying how the workers should initialize the KV cache
-    for a layer. Only contains the size of KV cache for that layer for now. Will
-    be extended to support multiple layers sharing the same memory pool.
+    A class for specifying how the workers should initialize the KV cache.
     """
-    size: int  # The size of KV cache Tensor in bytes
+    size: int  # size of the KV cache tensor in bytes
+    shared_by: list[str]  # layer names that share the same KV cache tensor
 
 
 @dataclass
@@ -183,27 +182,13 @@ class KVCacheConfig:
     """
     """The number of KV cache blocks"""
     num_blocks: int
-    """layer_name -> how to initialize KV cache for that layer"""
-    tensors: dict[str, KVCacheTensor]
+    """How should model runner initialize the KV cache tensors for each layer"""
+    kv_cache_tensors: list[KVCacheTensor]
     """
     The kv cache groups of the model.
-    The layers in the models are repeated with some patterns, e.g., a model
-    with 10 full attention layers and 20 sliding window attention layers can be
-    regarded as repeating the pattern (1 * full, 2 * sw) 10 times. 
-    The KVCacheManager allocates different block tables for each of the 3 layers
-    in the pattern, and repeats each of them 10 times to generate the 
-    block_table for the 30 layers in the model.
-    Therefore, we can group the layers in the model into 3 groups, each of which
-    contains 10 layers in the model.
-    The KVCacheManager allocates the block_table for each group based on its
-    kv_cache spec, and the model runner applies the block table to each layer 
-    in the group.
-    For example:
-    1. A model only uses full attention. The pattern is 
-    (num_hidden_layers * full), so there is only one group and the block table 
-    is shared by all layers.
-    2. (WIP) A model with 10 full attention layers and 20 sliding window 
-    attention layers. There are 3 layers in the pattern (1 * full, 2 * sw), so 
-    there are 3 groups, each of which represents 10 layers in the model.
+    For models with only one type of attention, there is only one group that
+    contains all layers.
+    For models with multiple types of attention, there will be multiple groups,
+    see `_get_kv_cache_config_uniform_page_size` for more details.
     """
     kv_cache_groups: list[KVCacheGroupSpec]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f6ccf0fa1..a90c294a9 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2088,33 +2088,58 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 block_sizes=block_sizes,
             )
 
-    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+    def _allocate_kv_cache_tensors(
+            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
         """
-        Initialize KV cache based on `kv_cache_config`.
+        Initializes the KV cache buffer with the correct size. The buffer needs
+        to be reshaped to the desired shape before being used by the models.
+
         Args:
-            kv_cache_config: Configuration for the KV cache, including the KV
-            cache size of each layer
+            kv_cache_config: The KV cache config 
+        Returns:
+            dict[str, torch.Tensor]: A map between layer names to their 
+            corresponding memory buffer for KV cache.
+         """
+        kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
+        for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+            tensor = torch.zeros(kv_cache_tensor.size,
+                                 dtype=torch.int8,
+                                 device=self.device)
+            for layer_name in kv_cache_tensor.shared_by:
+                kv_cache_raw_tensors[layer_name] = tensor
+
+        layer_names = set()
+        for group in kv_cache_config.kv_cache_groups:
+            layer_names.update(group.layer_names)
+        assert layer_names == set(kv_cache_raw_tensors.keys(
+        )), "Some layers are not correctly initialized"
+        return kv_cache_raw_tensors
+
+    def _reshape_kv_cache_tensors(
+        self,
+        kv_cache_config: KVCacheConfig,
+        kv_cache_raw_tensors: dict[str, torch.Tensor],
+    ) -> dict[str, torch.Tensor]:
         """
-        self.kv_cache_config = kv_cache_config
-        self.may_reinitialize_input_batch(kv_cache_config)
-        self.initialize_attn_backend(kv_cache_config)
+        Reshape the KV cache tensors to the desired shape and dtype.
 
+        Args:
+            kv_cache_config: The KV cache config 
+            kv_cache_raw_tensors: The KV cache buffer of each layer, with 
+            correct size but uninitialized shape.
+        Returns:
+            Dict[str, torch.Tensor]: A map between layer names to their 
+            corresponding memory buffer for KV cache.
+        """
         kv_caches: dict[str, torch.Tensor] = {}
-
-        for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
-            kv_cache_spec = kv_cache_group.kv_cache_spec
-            for layer_name in kv_cache_group.layer_names:
-                tensor_config = kv_cache_config.tensors[layer_name]
-                assert tensor_config.size % kv_cache_spec.page_size_bytes == 0
-                num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes
-                # `num_blocks` is the number of blocks the model runner can use.
-                # `kv_cache_config.num_blocks` is the number of blocks that
-                # KVCacheManager may allocate.
-                # Since different GPUs may have different number of layers and
-                # different memory capacities, `num_blocks` can be different on
-                # different GPUs, and `kv_cache_config.num_blocks` is set to
-                # the min of all `num_blocks`. Verify it here.
-                assert num_blocks >= kv_cache_config.num_blocks
+        for i, kv_cache_group_spec in enumerate(
+                kv_cache_config.kv_cache_groups):
+            kv_cache_spec = kv_cache_group_spec.kv_cache_spec
+            for layer_name in kv_cache_group_spec.layer_names:
+                raw_tensor = kv_cache_raw_tensors[layer_name]
+                assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
+                num_blocks = (raw_tensor.numel() //
+                              kv_cache_spec.page_size_bytes)
                 if isinstance(kv_cache_spec, AttentionSpec):
                     kv_cache_shape = self.attn_backends[i].get_kv_cache_shape(
                         num_blocks, kv_cache_spec.block_size,
@@ -2140,13 +2165,29 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                         kv_cache_stride_order.index(i)
                         for i in range(len(kv_cache_stride_order))
                     ]
-                    kv_caches[layer_name] = torch.zeros(
-                        kv_cache_shape, dtype=dtype,
-                        device=self.device).permute(*inv_order)
+                    kv_caches[layer_name] = kv_cache_raw_tensors[
+                        layer_name].view(dtype).view(kv_cache_shape).permute(
+                            *inv_order)
                 else:
-                    # TODO: add new branches when introducing more types of
-                    # KV cache specs.
-                    raise ValueError("Unknown KV cache spec type.")
+                    raise NotImplementedError
+        return kv_caches
+
+    def initialize_kv_cache_tensors(
+            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
+        """
+        Initialize the memory buffer for KV cache.
+
+        Args:
+            kv_cache_config: The KV cache config
+        Returns:
+            Dict[str, torch.Tensor]: A map between layer names to their 
+            corresponding memory buffer for KV cache.
+        """
+        # Initialize the memory buffer for KV cache
+        kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config)
+        # Change the memory buffer to the desired shape
+        kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
+                                                   kv_cache_raw_tensors)
 
         # Setup `kv_cache_config` and `kv_caches` for models
         # with cross-layer KV sharing
@@ -2157,17 +2198,30 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 kv_caches,
             )
 
+        bind_kv_cache(
+            kv_caches,
+            self.vllm_config.compilation_config.static_forward_context,
+            self.kv_caches)
+        return kv_caches
+
+    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+        """
+        Initialize KV cache based on `kv_cache_config`.
+        Args:
+            kv_cache_config: Configuration for the KV cache, including the KV
+            cache size of each layer
+        """
+        self.kv_cache_config = kv_cache_config
+        self.may_reinitialize_input_batch(kv_cache_config)
+        self.initialize_attn_backend(kv_cache_config)
+        kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
+
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
             # validate all draft model layers belong to the same kv cache
             # group
             self.drafter.validate_same_kv_cache_group(kv_cache_config)
 
-        bind_kv_cache(
-            kv_caches,
-            self.vllm_config.compilation_config.static_forward_context,
-            self.kv_caches)
-
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 94e438fb4..4c8ef0eaa 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -1365,14 +1365,20 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         assert self.block_table_cpu.dtype == self.input_batch.block_table[
             0].get_cpu_tensor().dtype
 
-        kv_caches: dict[str, torch.Tensor] = {}
+        kv_cache_sizes = {}
+        for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+            assert len(kv_cache_tensor.shared_by) == 1, (
+                "KV cache tensor shared by multiple layers is not supported in "
+                "TPU.")
+            kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
 
+        kv_caches: dict[str, torch.Tensor] = {}
         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec
             for layer_name in kv_cache_group.layer_names:
-                tensor_config = kv_cache_config.tensors[layer_name]
-                assert tensor_config.size % kv_cache_spec.page_size_bytes == 0
-                num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes
+                tensor_size = kv_cache_sizes[layer_name]
+                assert tensor_size % kv_cache_spec.page_size_bytes == 0
+                num_blocks = tensor_size // kv_cache_spec.page_size_bytes  # noqa
                 if isinstance(kv_cache_spec, AttentionSpec):
                     if self.use_spmd:
                         num_kv_heads = kv_cache_spec.num_kv_heads
-- 
GitLab


From b61dc5f972b3130ab6a3bc0fb57b990d02a3f8fb Mon Sep 17 00:00:00 2001
From: Chengji Yao <chengjiyao@google.com>
Date: Thu, 5 Jun 2025 21:27:38 -0700
Subject: [PATCH 202/274] [TPU] update torch_xla pin (#19231)

Signed-off-by: Chengji Yao <chengjiyao@google.com>
---
 requirements/tpu.txt         | 10 +++++-----
 tests/tpu/test_moe_pallas.py |  2 +-
 vllm/v1/worker/tpu_worker.py |  3 ++-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index 47e638463..a26dfd460 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250529
-torchvision==0.22.0.dev20250529
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.8.0.dev20250605
+torchvision==0.23.0.dev20250605
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
 
diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py
index ab6cd3069..407a824d8 100644
--- a/tests/tpu/test_moe_pallas.py
+++ b/tests/tpu/test_moe_pallas.py
@@ -27,7 +27,7 @@ TOP_KS = [2, 6]
 # The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
 @pytest.mark.parametrize("m", [8, 16, 64, 2048])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])
+@pytest.mark.parametrize("k", [128, 511, 1024])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("ep_size", EP_SIZE)
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 8d2f8112d..16a9f0959 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -100,7 +100,8 @@ class TPUWorker:
         # `xla_tpu_force_1d_allreduce_at_chunk_count` is a temporary solution to
         # fix this. It will be removed after the bug in XLA compiler is fixed.
         os.environ["LIBTPU_INIT_ARGS"] = (
-            "--xla_tpu_force_1d_allreduce_at_chunk_count=1")
+            os.environ.get("LIBTPU_INIT_ARGS", "") +
+            " --xla_tpu_force_1d_allreduce_at_chunk_count=1")
         torch.set_grad_enabled(False)
         torch.set_default_dtype(self.model_config.dtype)
 
-- 
GitLab


From 3da2313d781f73c4b3b6bd57a130f85b7c0f0ca4 Mon Sep 17 00:00:00 2001
From: Xu Song <xusong.vip@gmail.com>
Date: Fri, 6 Jun 2025 13:06:48 +0800
Subject: [PATCH 203/274] Support allowed_token_ids in ChatCompletionRequest
 (#19143)

Signed-off-by: Xu Song <xusong.vip@gmail.com>
---
 vllm/entrypoints/openai/protocol.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index ecfcc0068..79f0f200c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -271,6 +271,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     spaces_between_special_tokens: bool = True
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     prompt_logprobs: Optional[int] = None
+    allowed_token_ids: Optional[list[int]] = None
     # --8<-- [end:chat-completion-sampling-params]
 
     # --8<-- [start:chat-completion-extra-params]
@@ -549,6 +550,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
                 else RequestOutputKind.FINAL_ONLY,
             guided_decoding=guided_decoding,
             logit_bias=self.logit_bias,
+            allowed_token_ids=self.allowed_token_ids,
             extra_args=({"kv_transfer_params": self.kv_transfer_params}
                         if self.kv_transfer_params else None))
 
-- 
GitLab


From 91a2ef98eab486361edb1c57240372108ec8d9e3 Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Fri, 6 Jun 2025 02:09:43 -0400
Subject: [PATCH 204/274] [Chore] update CODEOWNERS (#19247)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
---
 .github/CODEOWNERS | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 4452ce22d..e98ccd035 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -10,15 +10,17 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb
+/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
+/vllm/reasoning @aarnphm
+/vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb
+/vllm/v1/structured_output @mgoin @russellb @aarnphm
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
-/tests/v1/structured_output @mgoin @russellb
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee
 
 # Docs
 /docs @hmellor
-mkdocs.yaml @hmellor
\ No newline at end of file
+mkdocs.yaml @hmellor
-- 
GitLab


From 90b78ec5f97384f10cb210368421ae3da17a3823 Mon Sep 17 00:00:00 2001
From: Jinghui Zhang <jinghuizhang0804@gmail.com>
Date: Thu, 5 Jun 2025 23:32:55 -0700
Subject: [PATCH 205/274] [v1][P/D] Fix a edge case in kv cache schedule
 (#19182)

Co-authored-by: jinghui <jinghui@fb.com>
---
 vllm/v1/core/sched/scheduler.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 7347f8e46..cb9fa61db 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1009,6 +1009,8 @@ class Scheduler(SchedulerInterface):
         # Now that the blocks are ready, actually cache them.
         block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0]
         num_computed_tokens = len(block_ids) * self.block_size
+        # Handle the case where num request tokens less then one block.
+        num_computed_tokens = min(num_computed_tokens, request.num_tokens)
         if num_computed_tokens == request.num_tokens:
             num_computed_tokens -= 1
         self.kv_cache_manager.cache_blocks(
-- 
GitLab


From 0d49483ea97705f531dd42383ecbb2476d7dfa2b Mon Sep 17 00:00:00 2001
From: Chengji Yao <chengjiyao@google.com>
Date: Fri, 6 Jun 2025 01:20:16 -0700
Subject: [PATCH 206/274] [TPU] fix kv cache dtype in model runner (#19244)

Signed-off-by: Chengji Yao <chengjiyao@google.com>
---
 vllm/v1/worker/tpu_model_runner.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 4c8ef0eaa..843bc3695 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -29,7 +29,8 @@ from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs,
                                     PlaceholderRange)
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sequence import IntermediateTensors
-from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv,
+                        is_pin_memory_available)
 from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
                                                PallasMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
@@ -138,6 +139,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
 
         self.pin_memory = is_pin_memory_available()
         self.dtype = self.model_config.dtype
+        if cache_config.cache_dtype == "auto":
+            self.kv_cache_dtype = self.dtype
+        else:
+            self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
+                cache_config.cache_dtype]
         self._hidden_states_dtype = self.dtype
 
         self.is_multimodal_model = model_config.is_multimodal_model
@@ -480,7 +486,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                         block_size=block_size,
                         num_kv_heads=attn_module.num_kv_heads,
                         head_size=attn_module.head_size,
-                        dtype=attn_module.dtype,
+                        dtype=self.kv_cache_dtype,
                         sliding_window=attn_module.sliding_window,
                         use_mla=False,
                     )
@@ -489,7 +495,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                         block_size=block_size,
                         num_kv_heads=attn_module.num_kv_heads,
                         head_size=attn_module.head_size,
-                        dtype=attn_module.dtype,
+                        dtype=self.kv_cache_dtype,
                         use_mla=False,
                     )
             elif attn_module.attn_type in (AttentionType.ENCODER,
-- 
GitLab


From 94870359cdce95cc1c56d7ae2730e8ac9be40049 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Fri, 6 Jun 2025 04:21:54 -0400
Subject: [PATCH 207/274] [Quantization] Bump compressed-tensors version;
 update NVFP4A16 test model (#19224)

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 requirements/common.txt                                     | 2 +-
 tests/quantization/test_compressed_tensors.py               | 3 +--
 .../quantization/compressed_tensors/compressed_tensors.py   | 6 +++---
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/requirements/common.txt b/requirements/common.txt
index de4b3b531..a6a1ffe76 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.4 # required for compressed-tensors
+compressed-tensors == 0.10.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 03480343d..2c07fe29f 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -651,10 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output
 
 
-@pytest.mark.skip(reason="Skip until the model config is updated")
 def test_compressed_tensors_nvfp4a16(vllm_runner):
     # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16"
     with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index dff62af86..1ee4617e1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -222,15 +222,15 @@ class CompressedTensorsConfig(QuantizationConfig):
                          input_quant: BaseModel):
 
         is_weight_only = weight_quant is not None and input_quant is None
-        is_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value)
         is_symmetric = weight_quant.symmetric
 
         is_group_size_16 = weight_quant.group_size == 16
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4
 
-        return (is_weight_only and is_group_quant and is_float_type
+        return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)
 
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
-- 
GitLab


From 65c69444b1b7d5da9ec5482ca27e32a752cd0a29 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 6 Jun 2025 01:22:45 -0700
Subject: [PATCH 208/274] [Docs] Improve V1 KVConnector interface documentation
 (#19172)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 .../kv_transfer/kv_connector/v1/base.py       | 29 +++++++++++++++++--
 vllm/v1/core/sched/scheduler.py               | 12 ++++----
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index 8f9d70eec..f80b5eba2 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -8,9 +8,15 @@ The class provides the following primitives:
     Scheduler-side: runs in the scheduler, binds metadata, which
     is used by the worker-side to load/save KV cache.
         get_num_new_matched_tokens() - get number of new tokens 
-            that exist in the remote KV cache
+            that exist in the remote KV cache. Might be called multiple
+            times for a given request and should be side-effect free.
         update_state_after_alloc() - update KVConnector state after
             temporary buffer alloc by the CacheManager.
+        request_finished() - called when a request is finished, with
+            the computed kv cache blocks for the request.
+            Returns whether KV cache should be freed now or will be
+            freed asynchronously and optionally returns KV transfer
+            params.
 
     Worker-side: runs in each worker, loads/saves KV cache to/from
     the Connector based on the metadata.
@@ -19,6 +25,9 @@ The class provides the following primitives:
 
         save_kv_layer() - starts saving KV for layer i (maybe async)
         wait_for_save() - blocks until all saves are done
+
+        get_finished() - called with ids of finished requests, returns
+            ids of requests that have completed async sending/recving.
 """
 
 import enum
@@ -184,7 +193,8 @@ class KVConnectorBase_V1(ABC):
         finished generating tokens.
 
         Returns:
-            ids of requests that have finished asynchronous transfer,
+            ids of requests that have finished asynchronous transfer
+            (requests that previously returned True from request_finished()),
             tuple of (sending/saving ids, recving/loading ids).
             The finished saves/sends req ids must belong to a set provided in a
             call to this method (this call or a prior one).
@@ -215,7 +225,8 @@ class KVConnectorBase_V1(ABC):
                 - The number of tokens that can be loaded from the 
                   external KV cache beyond what is already computed.
                 - `True` if external KV cache tokens will be loaded
-                  asynchronously (between scheduler steps).
+                  asynchronously (between scheduler steps). Must be
+                  'False' if the first element is 0.
         """
         pass
 
@@ -225,6 +236,18 @@ class KVConnectorBase_V1(ABC):
                                  num_external_tokens: int):
         """
         Update KVConnector state after block allocation.
+
+        If get_num_new_matched_tokens previously returned True for a
+        request, this function may be called twice for that same request -
+        first when blocks are allocated for the connector tokens to be
+        asynchronously loaded into, and second when any additional blocks
+        are allocated, after the load/transfer is complete.
+
+        Args:
+            request (Request): the request object.
+            blocks (KVCacheBlocks): the blocks allocated for the request.
+            num_external_tokens (int): the number of tokens that will be
+                loaded from the external KV cache.
         """
         pass
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index cb9fa61db..f3b5c7482 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -101,7 +101,7 @@ class Scheduler(SchedulerInterface):
         # This is flushed at the end of each scheduling step.
         self.finished_req_ids: set[str] = set()
 
-        # P/D: requests in process of recving KV transfers
+        # KV Connector: requests in process of async KV loading or recving
         self.finished_recving_kv_req_ids: set[str] = set()
 
         # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
@@ -822,7 +822,7 @@ class Scheduler(SchedulerInterface):
             if not stopped:
                 new_running.append(request)
 
-        # P/D: update state for finished KV Transfers.
+        # KV Connector: update state for finished KV Transfers.
         self._update_from_kv_xfer_finished(model_runner_output)
 
         # Return the cached request data to the queue so they can be reused.
@@ -969,7 +969,7 @@ class Scheduler(SchedulerInterface):
             self.kv_event_publisher.shutdown()
 
     ########################################################################
-    # P/D Related Methods
+    # KV Connector Related Methods
     ########################################################################
 
     def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
@@ -992,7 +992,7 @@ class Scheduler(SchedulerInterface):
 
     def _update_waiting_for_remote_kv(self, request: Request) -> bool:
         """
-        P/D: check if the request_id is finished_recving.
+        KV Connector: check if the request_id is finished_recving.
 
         The finished_recving_kv_req_ids list is populated
         on the previous steps()'s update_from_output based
@@ -1029,7 +1029,7 @@ class Scheduler(SchedulerInterface):
     def _update_from_kv_xfer_finished(self,
                                       model_runner_output: ModelRunnerOutput):
         """
-        P/D: update the scheduler state based on the output.
+        KV Connector: update the scheduler state based on the output.
 
         The Worker side connectors add finished_recving and
         finished_sending reqs to the output.
@@ -1037,7 +1037,7 @@ class Scheduler(SchedulerInterface):
         # if finished_recving: add to state so we can
             scheduler the request during the next step.
         """
-        # P/D: update recv and send status from last step.
+        # KV Connector:: update recv and send status from last step.
         for req_id in (model_runner_output.finished_recving or ()):
             logger.debug("Finished recving KV transfer for request %s", req_id)
             self.finished_recving_kv_req_ids.add(req_id)
-- 
GitLab


From da511d54d881385cdee45979fee5a5c1c39cdf3c Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Fri, 6 Jun 2025 04:23:35 -0400
Subject: [PATCH 209/274] Fix CompilationConfig repr (#19091)

Signed-off-by: rzou <zou3519@gmail.com>
---
 tests/test_config.py | 13 +++++++++++++
 vllm/config.py       | 35 ++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/tests/test_config.py b/tests/test_config.py
index dffea9138..ce383e1b4 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -6,6 +6,7 @@ from typing import Literal, Union
 
 import pytest
 
+from vllm.compilation.backends import VllmBackend
 from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig,
                          config, get_field)
 from vllm.model_executor.layers.pooler import PoolingType
@@ -44,6 +45,18 @@ def test_config(test_config, expected_error):
         config(test_config)
 
 
+def test_compile_config_repr_succeeds():
+    # setup: VllmBackend mutates the config object
+    config = VllmConfig()
+    backend = VllmBackend(config)
+    backend.configure_post_pass()
+
+    # test that repr(config) succeeds
+    val = repr(config)
+    assert 'VllmConfig' in val
+    assert 'inductor_passes' in val
+
+
 def test_get_field():
 
     @dataclass
diff --git a/vllm/config.py b/vllm/config.py
index cd6ac4f89..31a1d208e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -648,7 +648,7 @@ class ModelConfig:
     def maybe_pull_model_tokenizer_for_s3(self, model: str,
                                           tokenizer: str) -> None:
         """Pull model/tokenizer from S3 to temporary directory when needed.
-        
+
         Args:
             model: Model name or path
             tokenizer: Tokenizer name or path
@@ -1370,9 +1370,9 @@ class ModelConfig:
     def is_encoder_decoder(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
         """
-        For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to 
+        For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
         True to enable cross-attention
-        Neuron needs all multimodal data to be in the decoder and does not 
+        Neuron needs all multimodal data to be in the decoder and does not
         need to explicitly enable cross-attention
         """
         if (current_platform.is_neuron()
@@ -1794,7 +1794,7 @@ class ParallelConfig:
     """Global rank in distributed setup."""
 
     enable_multimodal_encoder_data_parallel: bool = False
-    """ Use data parallelism instead of tensor parallelism for vision encoder. 
+    """ Use data parallelism instead of tensor parallelism for vision encoder.
     Only support LLama4 for now"""
 
     @property
@@ -2272,9 +2272,9 @@ class DeviceConfig:
 
     device: SkipValidation[Union[Device, torch.device]] = "auto"
     """Device type for vLLM execution.
-    This parameter is deprecated and will be 
-    removed in a future release. 
-    It will now be set automatically based 
+    This parameter is deprecated and will be
+    removed in a future release.
+    It will now be set automatically based
     on the current platform."""
     device_type: str = field(init=False)
     """Device type from the current platform. This is set in
@@ -4007,19 +4007,24 @@ class CompilationConfig:
 
     def __repr__(self) -> str:
         exclude = {
-            "static_forward_context",
-            "enabled_custom_ops",
-            "disabled_custom_ops",
-            "compilation_time",
-            "bs_to_padded_graph_size",
-            "pass_config",
-            "traced_files",
+            "static_forward_context": True,
+            "enabled_custom_ops": True,
+            "disabled_custom_ops": True,
+            "compilation_time": True,
+            "bs_to_padded_graph_size": True,
+            "pass_config": True,
+            "traced_files": True,
+            "inductor_compile_config": {
+                "post_grad_custom_post_pass": True,
+            },
         }
         # The cast to string is necessary because Pydantic is mocked in docs
         # builds and sphinx-argparse doesn't know the return type of decode()
         return str(
             TypeAdapter(CompilationConfig).dump_json(
-                self, exclude=exclude, exclude_unset=True).decode())
+                self,
+                exclude=exclude,  # type: ignore[arg-type]
+                exclude_unset=True).decode())
 
     __str__ = __repr__
 
-- 
GitLab


From f168b85725202915b5719c62b46d310a608b13dd Mon Sep 17 00:00:00 2001
From: Siqi Yan <ysq0807@hotmail.com>
Date: Fri, 6 Jun 2025 01:24:02 -0700
Subject: [PATCH 210/274] Unit Test for run_dp_sharded_vision_model (#19103)

Signed-off-by: Siqi Yan <siqi@meta.com>
Co-authored-by: Siqi Yan <siqi@meta.com>
---
 tests/multimodal/test_utils.py | 98 +++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index c8a544822..5ac0a90f5 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -9,12 +9,21 @@ from typing import TYPE_CHECKING, NamedTuple, Optional
 
 import numpy as np
 import pytest
+import torch
+import torch.multiprocessing as mp
 from PIL import Image, ImageChops
 
+from tests.utils import multi_gpu_test
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
-                                   merge_and_sort_multimodal_metadata)
+                                   merge_and_sort_multimodal_metadata,
+                                   run_dp_sharded_vision_model)
+from vllm.platforms import current_platform
+from vllm.utils import get_open_port, update_environment_variables
 
 if TYPE_CHECKING:
     from vllm.multimodal.hasher import MultiModalHashDict
@@ -413,3 +422,90 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
         assert modalities == expected_modalities
         assert ranges == expected_ranges
         assert hashes == expected_hashes
+
+
+class SimpleLinearModel(torch.nn.Module):
+    """A simple linear vision model for testing."""
+
+    def __init__(self, input_dim: int = 3 * 224 * 224, output_dim: int = 32):
+        super().__init__()
+        self.flatten = torch.nn.Flatten()
+        self.linear = torch.nn.Linear(input_dim, output_dim)
+
+    def forward(self, x: torch.Tensor):
+        # Flatten the input and apply linear transformation
+        x = self.flatten(x)
+        return self.linear(x)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "batch_size",
+    [
+        1,  # Single image
+        4,  # Small batch
+        5,  # Odd batch size (for testing padding)
+    ],
+)
+def test_run_dp_sharded_vision_model(batch_size: int):
+    world_size = 2
+    # Launch processes
+    mp.spawn(
+        run_dp_sharded_vision_model_vs_direct,
+        args=(
+            world_size,
+            batch_size,
+            get_open_port(),
+        ),
+        nprocs=world_size,
+    )
+
+
+def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
+                                          batch_size: int, master_port: int):
+    """
+    Test that run_dp_sharded_vision_model produces the same results as 
+    calling the model directly.
+    """
+
+    # Set random seed for reproducibility
+    current_platform.seed_everything(0)
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': str(master_port),
+    })
+
+    # initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # Create a test input tensor
+    image_input = torch.randn(batch_size, 3, 224, 224)
+
+    # Create a simple linear model
+    vision_model = SimpleLinearModel()
+
+    # Run the model directly on the full input
+    with torch.inference_mode():
+        direct_output = vision_model(image_input)
+
+    # Run the model through the sharded function
+    with torch.inference_mode():
+        sharded_output = run_dp_sharded_vision_model(image_input, vision_model)
+
+    # Check that the world size is setup correctly
+    assert get_tensor_model_parallel_world_size() == world_size
+
+    # Check that the outputs have the same shape
+    assert direct_output.shape == sharded_output.shape
+
+    # Check that the outputs are close (they should be identical)
+    assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5)
-- 
GitLab


From 7661e92ef85e552936195ae4b803e292b9a96776 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 6 Jun 2025 18:05:14 +0800
Subject: [PATCH 211/274] [Model] Optimize nemotron_h implementation (#19249)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/nemotron_h.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 2ef8d3115..3424efa80 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/vllm-project/vllm/blob/94d8ec8d2bcb4ec55e33022b313c7e978edf05e1/vllm/model_executor/models/bamba.py
 # Copyright 2024 HuggingFace Inc. team. All rights reserved.
@@ -29,7 +30,7 @@ from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -63,19 +64,22 @@ class NemotronHMLP(nn.Module):
         config: NemotronHConfig,
         quant_config: Optional[QuantizationConfig] = None,
         bias: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
-        self.up_proj = MergedColumnParallelLinear(
+        self.up_proj = ColumnParallelLinear(
             input_size=config.hidden_size,
-            output_sizes=[config.intermediate_size],
+            output_size=config.intermediate_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
         )
         self.down_proj = RowParallelLinear(
             input_size=config.intermediate_size,
             output_size=config.hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         self.act_fn = ReLUSquaredActivation()
 
@@ -99,9 +103,12 @@ class NemotronHMLPDecoderLayer(nn.Module):
         super().__init__()
         self.config = config
 
-        self.mixer = NemotronHMLP(config,
-                                  quant_config=quant_config,
-                                  bias=config.mlp_bias)
+        self.mixer = NemotronHMLP(
+            config,
+            quant_config=quant_config,
+            bias=config.mlp_bias,
+            prefix=f"{prefix}.mixer",
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
@@ -207,12 +214,14 @@ class NemotronHAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.attn = Attention(
@@ -253,7 +262,7 @@ class NemotronHAttentionDecoderLayer(nn.Module):
             layer_idx,
             cache_config,
             quant_config,
-            prefix,
+            prefix=f"{prefix}.mixer",
         )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -435,7 +444,6 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
             "k_proj",
             "v_proj",
         ],
-        "gate_up_proj": ["up_proj", "down_proj"]
     }
 
     # LoRA specific attributes
-- 
GitLab


From 7353492a472904d93854853e19b389e9656991d6 Mon Sep 17 00:00:00 2001
From: jmswen <jmswen@users.noreply.github.com>
Date: Fri, 6 Jun 2025 04:03:01 -0700
Subject: [PATCH 212/274] [Core] Raise when non-multi-instance DP clients
 target a DP rank (#19227)

Signed-off-by: Jon Swenson <jmswen@gmail.com>
---
 tests/async_engine/test_async_llm_engine.py | 22 ++++++++++++++++
 tests/v1/engine/test_async_llm.py           | 29 +++++++++++++++++++++
 tests/v1/test_async_llm_dp.py               | 25 +++++++++++-------
 vllm/engine/async_llm_engine.py             |  4 +++
 vllm/v1/engine/core_client.py               |  3 ---
 vllm/v1/engine/processor.py                 |  6 +++++
 6 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index 1a31bdbfc..043b75cc5 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -384,3 +384,25 @@ async def test_delayed_generator(async_engine, stop):
     assert final_output is not None
     assert len(final_output.outputs[0].token_ids) == 10
     assert final_output.finished
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_invalid_argument(async_engine):
+    scheduler_config = await async_engine.get_scheduler_config()
+
+    if scheduler_config.num_scheduler_steps != 1:
+        pytest.skip("no need to test this one with multistep")
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+    )
+
+    # Targeting specific DP rank only supported in v1 multi-instance DP
+    with pytest.raises(ValueError):
+        async for _ in async_engine.generate("test",
+                                             sampling_params,
+                                             request_id=uid(),
+                                             data_parallel_rank=0):
+            pass
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 957d50d0d..a65fc35e0 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -250,3 +250,32 @@ async def test_customize_loggers(monkeypatch):
         assert len(engine.stat_loggers) == 1
         assert len(engine.stat_loggers[0]) == 1
         engine.stat_loggers[0][0].log.assert_called_once()
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m, ExitStack() as after:
+        m.setenv("VLLM_USE_V1", "1")
+
+        engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
+        after.callback(engine.shutdown)
+
+        sampling_params = SamplingParams(max_tokens=100,
+                                         output_kind=RequestOutputKind.DELTA,
+                                         temperature=1.0,
+                                         seed=33)
+
+        # Test with valid DP rank.
+        async for _ in engine.generate(request_id="request-34",
+                                       prompt=TEXT_PROMPT,
+                                       sampling_params=sampling_params,
+                                       data_parallel_rank=0):
+            pass
+
+        # Test with out-of-range DP rank.
+        with pytest.raises(ValueError):
+            async for _ in engine.generate(request_id="request-35",
+                                           prompt=TEXT_PROMPT,
+                                           sampling_params=sampling_params,
+                                           data_parallel_rank=1):
+                pass
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py
index 53242180b..075ceb257 100644
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -29,12 +29,14 @@ if not current_platform.supports_v1(engine_args.create_model_config()):
                 allow_module_level=True)
 
 
-async def generate(engine: AsyncLLM,
-                   request_id: str,
-                   prompt: PromptType,
-                   output_kind: RequestOutputKind,
-                   max_tokens: int,
-                   prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
+async def generate(
+        engine: AsyncLLM,
+        request_id: str,
+        prompt: PromptType,
+        output_kind: RequestOutputKind,
+        max_tokens: int,
+        prompt_logprobs: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
 
@@ -46,7 +48,8 @@ async def generate(engine: AsyncLLM,
                                      prompt_logprobs=prompt_logprobs)
     async for out in engine.generate(request_id=request_id,
                                      prompt=prompt,
-                                     sampling_params=sampling_params):
+                                     sampling_params=sampling_params,
+                                     data_parallel_rank=data_parallel_rank):
 
         num_tokens = len(out.outputs[0].token_ids)
         if output_kind == RequestOutputKind.DELTA:
@@ -89,8 +92,12 @@ async def test_load(output_kind: RequestOutputKind,
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, prompt, output_kind,
-                             NUM_EXPECTED_TOKENS)))
+                    generate(engine,
+                             request_id,
+                             prompt,
+                             output_kind,
+                             NUM_EXPECTED_TOKENS,
+                             data_parallel_rank=0)))
         # Confirm that we got all the EXPECTED tokens from the requests.
         done, pending = await asyncio.wait(tasks,
                                            return_when=asyncio.FIRST_EXCEPTION)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 59971f5d6..72020a8cc 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -494,6 +494,10 @@ class _AsyncLLMEngine(LLMEngine):
         if arrival_time is None:
             arrival_time = time.time()
 
+        if data_parallel_rank is not None:
+            raise ValueError("Targeting data_parallel_rank only supported "
+                             "in v1 client.")
+
         if (isinstance(prompt, dict)
                 and prompt.get("prompt_embeds", None) is not None
                 and not prompt.get("prompt_token_ids", None)):
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index d1b0b300d..7eff377b7 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1000,9 +1000,6 @@ class DPAsyncMPClient(AsyncMPClient):
                                     ) -> CoreEngine:
         if dp_rank is not None:
             # engines are already in rank order
-            if dp_rank < 0 or dp_rank >= len(self.core_engines):
-                raise ValueError(f"Requested DP rank {dp_rank} is out of "
-                                 f"range [0, {len(self.core_engines)})")
             return self.core_engines[dp_rank]
 
         if not self.lb_engines:
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 546fc98d6..e28879d40 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -226,6 +226,12 @@ class Processor:
         if prompt_adapter_request is not None:
             raise ValueError("V1 does not support prompt_adapter_request.")
 
+        data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
+        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
+                                                   data_parallel_size):
+            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
+                             f"is out of range [0, {data_parallel_size}).")
+
         if arrival_time is None:
             arrival_time = time.time()
 
-- 
GitLab


From 8267f9916f9b4ceace3a8daa908d52824068862b Mon Sep 17 00:00:00 2001
From: Yu Guo <82124926+yuguo68@users.noreply.github.com>
Date: Fri, 6 Jun 2025 04:59:25 -0700
Subject: [PATCH 213/274] improve logits bias (#19041)

---
 vllm/v1/sample/sampler.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 8ba3c2087..6bc0cecdd 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -5,6 +5,7 @@
 import torch
 import torch.nn as nn
 
+from vllm.utils import async_tensor_h2d, is_pin_memory_available
 from vllm.v1.outputs import LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words
@@ -20,6 +21,7 @@ class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
         self.topk_topp_sampler = TopKTopPSampler()
+        self.pin_memory = is_pin_memory_available()
 
     def forward(
         self,
@@ -232,6 +234,10 @@ class Sampler(nn.Module):
         # One idea is implement this as a PyTorch C++ op, and we may
         # even optimize the logit_bias layout.
 
+        rows: list[int] = []
+        cols: list[int] = []
+        vals: list[float] = []
+
         # Get vocabulary size from logits
         vocab_size = logits.shape[-1]
 
@@ -244,7 +250,16 @@ class Sampler(nn.Module):
                             f"token_id {token_id} in logit_bias contains "
                             f"out-of-vocab token id. Vocabulary size: "
                             f"{vocab_size}")
-                    logits[i, token_id] += bias
+                    rows.append(i)
+                    cols.append(token_id)
+                    vals.append(bias)
+
+        if rows:
+            indices = async_tensor_h2d([rows, cols], torch.int64,
+                                       logits.device, self.pin_memory)
+            values = async_tensor_h2d(vals, torch.float, logits.device,
+                                      self.pin_memory)
+            logits.index_put_(tuple(indices), values=values, accumulate=True)
         return logits
 
     def apply_allowed_token_ids(
-- 
GitLab


From 94ecee628258a2a62af06e92530128582e8f165e Mon Sep 17 00:00:00 2001
From: Nishidha <nishidha.panpaliya@partner.ibm.com>
Date: Sat, 7 Jun 2025 00:24:26 +0530
Subject: [PATCH 214/274] Fixed ppc build when it runs on non-RHEL based linux
 distros (#18422)

Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
---
 .../hardware_ci/run-cpu-test-ppc64le.sh       |  1 +
 docker/Dockerfile.ppc64le                     | 96 ++++++++++++++-----
 2 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 077bd9914..8f9b18437 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -7,6 +7,7 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
   if [[ -n "$container_id" ]]; then
+      podman stop --all -t0
       podman rm -f "$container_id" || true
   fi
   podman system prune -f
diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le
index 14043eb7a..aaff24038 100644
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@@ -1,10 +1,41 @@
 ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
 
+###############################################################
+# Stage to build openblas
+###############################################################
+
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
+
+ARG MAX_JOBS
+ARG OPENBLAS_VERSION=0.3.29
+RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
+    && source /opt/rh/gcc-toolset-13/enable \
+    && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
+    && unzip OpenBLAS-$OPENBLAS_VERSION.zip \
+    && cd OpenBLAS-$OPENBLAS_VERSION \
+    &&  make -j${MAX_JOBS} TARGET=POWER9 BINARY=64 USE_OPENMP=1 USE_THREAD=1 NUM_THREADS=120 DYNAMIC_ARCH=1 INTERFACE64=0 \
+    && cd /tmp && touch control
+
+
+###############################################################
+# base stage with dependencies coming from centos mirrors
+###############################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
+RUN  microdnf install -y dnf && \ 
+     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
+        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
+        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+        dnf config-manager --set-enabled crb
+
+RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \
+    dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch 
+
+
 ###############################################################
 # base stage with basic dependencies
 ###############################################################
 
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder
+FROM centos-deps-builder AS base-builder
 
 ARG PYTHON_VERSION=3.12
 ARG OPENBLAS_VERSION=0.3.29
@@ -20,25 +51,27 @@ ENV UV_LINK_MODE=copy
 # Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
 # Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
 #       when `--jobs=<N>` is passed with podman build command
-RUN microdnf install -y openssl-devel dnf \
-    && dnf install -y  https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
-    && dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
+
+COPY --from=openblas-builder /tmp/control /dev/null
+
+RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
+    dnf install -y openssl-devel \
     && dnf install -y \
-       git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
+       git tar gcc-toolset-13 automake libtool \
        pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
-       libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
-       freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
-       harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
+       libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
+       harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
        python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
     && dnf clean all \
+    && PREFIX=/usr/local make -C /openblas install \
     && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
     && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
     && python -m pip install -U pip uv \
     && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
-    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
     && cd /tmp && touch control
 
+
 ###############################################################
 # Stage to build torch family
 ###############################################################
@@ -48,6 +81,8 @@ FROM base-builder AS torch-builder
 ARG MAX_JOBS
 ARG TORCH_VERSION=2.6.0
 ARG _GLIBCXX_USE_CXX11_ABI=1
+ARG OPENBLAS_VERSION=0.3.29
+
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable &&  \
     git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
@@ -109,7 +144,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         .. && \
     make install -j ${MAX_JOBS:-$(nproc)} && \
     cd ../../python/ && \
-    uv pip install -v -r requirements-wheel-build.txt && \
+    uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
+    pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
     PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
     python setup.py build_ext \
     --build-type=release --bundle-arrow-cpp \
@@ -132,8 +168,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd opencv-python && \
     sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
     cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
+    uv pip install scikit-build && \    
     python -m build --wheel --installer=uv --outdir /opencvwheels/
 
+###############################################################
+# Stage to build numactl
+###############################################################
+
+FROM base-builder AS numa-builder
+
+# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
+# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
+ARG MAX_JOBS
+ARG NUMACTL_VERSION=2.0.19
+RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
+    && cd numactl \
+    && autoreconf -i && ./configure \
+    && make -j ${MAX_JOBS:-$(nproc)}
+
+
 ###############################################################
 # Stage to build vllm - this stage builds and installs
 # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
@@ -145,6 +198,7 @@ FROM base-builder AS vllmcache-builder
 COPY --from=torch-builder /tmp/control /dev/null
 COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null
+COPY --from=numa-builder /tmp/control /dev/null
 
 ARG VLLM_TARGET_DEVICE=cpu
 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
@@ -160,11 +214,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
+    --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
     --mount=type=bind,src=.,dst=/src/,rw \
     source /opt/rh/gcc-toolset-13/enable && \
     uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
     sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
     uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
+    make -C /numactl install && \
     # sentencepiece.pc is in some pkgconfig inside uv cache
     export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
     uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
@@ -173,21 +229,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install /vllmwheel/*.whl
 
 
-###############################################################
-# Stage to build numactl
-###############################################################
-
-FROM base-builder AS numa-builder
-
-# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
-# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
-ARG MAX_JOBS
-ARG NUMACTL_VERSION=2.0.19
-RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
-    && cd numactl \
-    && autoreconf -i && ./configure \
-    && make -j ${MAX_JOBS:-$(nproc)}
-
 ###############################################################
 # Stage to build lapack
 ###############################################################
@@ -217,6 +258,7 @@ ENV PATH=${VIRTUAL_ENV}/bin:$PATH
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
 ENV UV_LINK_MODE=copy
+ENV OMP_NUM_THREADS=16
 
 # create artificial dependencies between stages for independent stages to build in parallel
 COPY --from=torch-builder /tmp/control /dev/null
@@ -225,11 +267,13 @@ COPY --from=cv-builder /tmp/control /dev/null
 COPY --from=vllmcache-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
 COPY --from=lapack-builder /tmp/control /dev/null
+COPY --from=openblas-builder /tmp/control /dev/null
 
 # install gcc-11, python, openblas, numactl, lapack
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
     --mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
+    --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
     rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
     microdnf install --nodocs -y \
     tar findutils openssl \
@@ -241,8 +285,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     && microdnf clean all \
     && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
     && python -m pip install -U pip uv --no-cache \
-    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
     && make -C /numactl install \
+    && PREFIX=/usr/local make -C /openblas install \
     && uv pip install 'cmake<4' \
     && cmake --install /lapack/build \
     && uv pip uninstall cmake
-- 
GitLab


From aad30bd306d30025dc05a7e9eb2aa3ed5c3c1ed0 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 6 Jun 2025 13:16:24 -0700
Subject: [PATCH 215/274] [BugFix] Fix MultiConnector test after HMA changes
 (#19291)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 .../v1/kv_connector/unit/test_multi_connector.py  | 15 ++++++++-------
 .../kv_connector/v1/multi_connector.py            |  4 ++--
 vllm/v1/core/kv_cache_manager.py                  |  4 ++++
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index d90ddcdbb..72848c1a7 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -60,7 +60,8 @@ class TestSharedStorageConnector(SharedStorageConnector):
                     if isinstance(arg, int):
                         to_log.append(str(arg))
                     elif isinstance(arg, KVCacheBlocks):
-                        to_log.append(f"num_blocks={len(arg.blocks)}")
+                        to_log.append(
+                            f"num_blocks={[len(b) for b in arg.blocks]}")
 
                 # Log the event as a line to the file
                 try:
@@ -176,7 +177,7 @@ def test_multi_shared_storage_connector_consistency():
     # on each connector in turn.
     assert events["storage1-SCHEDULER"][:3] == [
         'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
     ]
     assert events["storage1-WORKER"][:5] == [
         'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
@@ -184,7 +185,7 @@ def test_multi_shared_storage_connector_consistency():
     ]
     assert events["storage2-SCHEDULER"][:3] == [
         'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
     ]
     assert events["storage2-WORKER"][:5] == [
         'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
@@ -205,11 +206,11 @@ def test_multi_shared_storage_connector_consistency():
     # chosen).
     assert events["storage1-SCHEDULER"][:3] == [
         'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=7 96', 'build_connector_meta'
+        'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
     ]
     assert events["storage2-SCHEDULER"][:3] == [
         'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
     ]
 
     # Delete storage1 connector state
@@ -229,11 +230,11 @@ def test_multi_shared_storage_connector_consistency():
     # blocks for the second connector.
     assert events["storage1-SCHEDULER"][:3] == [
         'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=0 0', 'build_connector_meta'
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
     ]
     assert events["storage2-SCHEDULER"][:3] == [
         'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=7 96', 'build_connector_meta'
+        'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
     ]
 
     # Clean up
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index b2cf88f5b..be3c23399 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -153,6 +153,7 @@ class MultiConnector(KVConnectorBase_V1):
                                  num_external_tokens: int):
         chosen_connector = self._requests_to_connector.get(
             request.request_id, -1)
+        empty_blocks = blocks.new_empty()
         for i, c in enumerate(self._connectors):
             if i == chosen_connector:
                 # Forward call to the chosen connector (if any).
@@ -160,8 +161,7 @@ class MultiConnector(KVConnectorBase_V1):
                                            num_external_tokens)
             else:
                 # Call with empty blocks for other connectors.
-                c.update_state_after_alloc(request,
-                                           KVCacheBlocks.create_empty(), 0)
+                c.update_state_after_alloc(request, empty_blocks, 0)
 
     def build_connector_meta(
             self,
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index fc701215b..35fb189fd 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -61,6 +61,10 @@ class KVCacheBlocks:
             if block.block_hash is None
         ]
 
+    def new_empty(self) -> "KVCacheBlocks":
+        """Creates a new KVCacheBlocks instance with no blocks."""
+        return KVCacheBlocks([[] for _ in range(len(self.blocks))])
+
 
 class KVCacheManager:
 
-- 
GitLab


From ca27f0f9c1452a0e73126be2b1666c3067cf6290 Mon Sep 17 00:00:00 2001
From: Adolfo Victoria <adolfokarim@gmail.com>
Date: Fri, 6 Jun 2025 13:17:54 -0700
Subject: [PATCH 216/274] [Bugfix][Core] Update cancellation logic in
 `generate()` to handle Generator exits (#19225)

Co-authored-by: Adolfo Victoria <adovi@meta.com>
---
 tests/v1/engine/test_async_llm.py | 169 ++++++++++++++++++++++--------
 vllm/v1/engine/async_llm.py       |   5 +-
 2 files changed, 129 insertions(+), 45 deletions(-)

diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index a65fc35e0..7dff937c0 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -22,9 +22,11 @@ if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
                 allow_module_level=True)
 
-TEXT_ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
-                                   enforce_eager=True,
-                                   disable_log_requests=True)
+TEXT_ENGINE_ARGS = AsyncEngineArgs(
+    model="meta-llama/Llama-3.2-1B-Instruct",
+    enforce_eager=True,
+    disable_log_requests=True,
+)
 
 VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
                                      enforce_eager=True,
@@ -41,28 +43,33 @@ VISION_PROMPT = {
     "prompt": VISION_PROMPT_TEMPLATE,
     "multi_modal_data": {
         "image": ImageAsset("stop_sign").pil_image
-    }
+    },
 }
 
 
-async def generate(engine: AsyncLLM,
-                   request_id: str,
-                   prompt: PromptType,
-                   output_kind: RequestOutputKind,
-                   max_tokens: int,
-                   n: int = 1,
-                   prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
+async def generate(
+    engine: AsyncLLM,
+    request_id: str,
+    prompt: PromptType,
+    output_kind: RequestOutputKind,
+    max_tokens: int,
+    n: int = 1,
+    prompt_logprobs: Optional[int] = None,
+    cancel_after: Optional[int] = None,
+) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
 
     count = 0
-    sampling_params = SamplingParams(max_tokens=max_tokens,
-                                     ignore_eos=True,
-                                     output_kind=output_kind,
-                                     temperature=0.5,
-                                     seed=33,
-                                     n=n,
-                                     prompt_logprobs=prompt_logprobs)
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        ignore_eos=True,
+        output_kind=output_kind,
+        temperature=0.5,
+        seed=33,
+        n=n,
+        prompt_logprobs=prompt_logprobs,
+    )
     async for out in engine.generate(request_id=request_id,
                                      prompt=prompt,
                                      sampling_params=sampling_params):
@@ -73,20 +80,27 @@ async def generate(engine: AsyncLLM,
         else:
             count = num_tokens
 
-        await asyncio.sleep(0.)
+        if cancel_after is not None and count >= cancel_after:
+            return count, request_id
+
+        await asyncio.sleep(0.0)
 
     return count, request_id
 
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
-@pytest.mark.parametrize("engine_args,prompt",
-                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
-                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
 @pytest.mark.asyncio
-async def test_load(monkeypatch: pytest.MonkeyPatch,
-                    output_kind: RequestOutputKind,
-                    engine_args: AsyncEngineArgs, prompt: PromptType):
+async def test_load(
+    monkeypatch: pytest.MonkeyPatch,
+    output_kind: RequestOutputKind,
+    engine_args: AsyncEngineArgs,
+    prompt: PromptType,
+):
     # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
     # so that in the future when we switch, we don't have to change all the
     # tests.
@@ -125,13 +139,17 @@ async def test_load(monkeypatch: pytest.MonkeyPatch,
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
-@pytest.mark.parametrize("engine_args,prompt",
-                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
-                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
 @pytest.mark.asyncio
-async def test_abort(monkeypatch: pytest.MonkeyPatch,
-                     output_kind: RequestOutputKind,
-                     engine_args: AsyncEngineArgs, prompt: PromptType):
+async def test_abort(
+    monkeypatch: pytest.MonkeyPatch,
+    output_kind: RequestOutputKind,
+    engine_args: AsyncEngineArgs,
+    prompt: PromptType,
+):
 
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
@@ -150,8 +168,9 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch,
         # Create concurrent requests.
         tasks: list[asyncio.Task] = []
         for idx, request_id in enumerate(request_ids):
-            max_tokens = NUM_EXPECTED_TOKENS_LONG if (
-                idx in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS
+            max_tokens = (NUM_EXPECTED_TOKENS_LONG if
+                          (idx
+                           in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS)
             n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
             tasks.append(
                 asyncio.create_task(
@@ -192,12 +211,17 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch,
 
 
 @pytest.mark.parametrize("n", [1, 3])
-@pytest.mark.parametrize("engine_args,prompt",
-                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
-                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
 @pytest.mark.asyncio
-async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
-                             engine_args: AsyncEngineArgs, prompt: PromptType):
+async def test_finished_flag(
+    monkeypatch: pytest.MonkeyPatch,
+    n: int,
+    engine_args: AsyncEngineArgs,
+    prompt: PromptType,
+):
 
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
@@ -205,11 +229,13 @@ async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
         engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
-        sampling_params = SamplingParams(max_tokens=100,
-                                         output_kind=RequestOutputKind.DELTA,
-                                         temperature=1.0,
-                                         seed=33,
-                                         n=n)
+        sampling_params = SamplingParams(
+            max_tokens=100,
+            output_kind=RequestOutputKind.DELTA,
+            temperature=1.0,
+            seed=33,
+            n=n,
+        )
         outputs = [
             out
             async for out in engine.generate(request_id="request-33",
@@ -222,6 +248,63 @@ async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
         assert outputs[-1].finished
 
 
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
+@pytest.mark.asyncio
+async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
+                                       engine_args: AsyncEngineArgs,
+                                       prompt: PromptType):
+    """Test that requests can be cancelled mid-stream."""
+    with monkeypatch.context() as m, ExitStack() as after:
+        m.setenv("VLLM_USE_V1", "1")
+
+        engine = AsyncLLM.from_engine_args(engine_args)
+        after.callback(engine.shutdown)
+
+        NUM_REQUESTS = 100
+        NUM_TOKENS = 1000
+        NUM_EXPECTED_TOKENS = 20
+
+        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
+
+        # Create concurrent requests that will be cancelled mid-stream
+        tasks = []
+        for request_id in request_ids:
+            tasks.append(
+                asyncio.create_task(
+                    generate(
+                        engine,
+                        request_id,
+                        prompt,
+                        RequestOutputKind.DELTA,
+                        NUM_TOKENS,
+                        cancel_after=NUM_EXPECTED_TOKENS,
+                    )))
+
+        # Wait for all tasks to complete
+        results = await asyncio.gather(*tasks)
+
+        # Verify all tasks were cancelled at the expected point
+        for num_generated_tokens, request_id in results:
+            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                f"{request_id} generated {num_generated_tokens} tokens but "
+                f"expected to cancel after {NUM_EXPECTED_TOKENS}")
+
+        # Make sure no requests are left hanging
+        assert not engine.output_processor.has_unfinished_requests()
+
+        # Confirm we can reuse the request id after the cancellations.
+        request_id = request_ids[0]
+        task = asyncio.create_task(
+            generate(engine, request_id, prompt, RequestOutputKind.DELTA,
+                     NUM_EXPECTED_TOKENS))
+        num_generated_tokens, request_id = await task
+        assert num_generated_tokens == NUM_EXPECTED_TOKENS
+        assert not engine.output_processor.has_unfinished_requests()
+
+
 class MockLoggingStatLogger(LoggingStatLogger):
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 089f15aee..7fb36cf59 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -332,8 +332,9 @@ class AsyncLLM(EngineClient):
                 yield out
 
         # If the request is disconnected by the client, generate()
-        # is cancelled. So, we abort the request if we end up here.
-        except asyncio.CancelledError:
+        # is cancelled or the generator is garbage collected. So,
+        # we abort the request if we end up here.
+        except (asyncio.CancelledError, GeneratorExit):
             await self.abort(request_id)
             if self.log_requests:
                 logger.info("Request %s aborted.", request_id)
-- 
GitLab


From b6a3a9f76db737e8b3679d824b8cd744e883e7fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Sat, 7 Jun 2025 01:27:59 +0200
Subject: [PATCH 217/274] [Core] Fix abrupt request abort (#18485)

Signed-off-by: nicklucche <nlucches@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>

Co-authored-by: Nick Hill <nhill@redhat.com>
---
 vllm/v1/core/kv_cache_coordinator.py |  2 +-
 vllm/v1/core/sched/scheduler.py      | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 993ce4b48..231bad1df 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -164,7 +164,7 @@ class KVCacheCoordinator(ABC):
         Get the blocks for the request.
         """
         return [
-            manager.req_to_blocks[request_id]
+            manager.req_to_blocks.get(request_id) or []
             for manager in self.single_type_managers
         ]
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index f3b5c7482..b3293d9a5 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -76,6 +76,9 @@ class Scheduler(SchedulerInterface):
         # KV Connector pushes/pull of remote KVs for P/D and offloading.
         self.connector = None
         if self.vllm_config.kv_transfer_config is not None:
+            assert len(self.kv_cache_config.kv_cache_groups) == 1, (
+                "Multiple KV cache groups are not currently supported "
+                "with KV connectors")
             self.connector = KVConnectorFactory.create_connector_v1(
                 config=self.vllm_config, role=KVConnectorRole.SCHEDULER)
 
@@ -985,9 +988,8 @@ class Scheduler(SchedulerInterface):
         """
         if self.connector is None:
             return False, None
-        assert len(self.kv_cache_config.kv_cache_groups
-                   ) == 1, "KV connector only supports one KV cache group now"
-        block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0]
+
+        (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
         return self.connector.request_finished(request, block_ids)
 
     def _update_waiting_for_remote_kv(self, request: Request) -> bool:
@@ -1002,12 +1004,12 @@ class Scheduler(SchedulerInterface):
         and the request state will be moved back to WAITING from
         WAITING_FOR_REMOTE_KV.
         """
+        assert self.connector is not None
         if request.request_id not in self.finished_recving_kv_req_ids:
             return False
-        assert len(self.kv_cache_config.kv_cache_groups
-                   ) == 1, "KV connector only supports one KV cache group now"
+
         # Now that the blocks are ready, actually cache them.
-        block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0]
+        (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
         num_computed_tokens = len(block_ids) * self.block_size
         # Handle the case where num request tokens less then one block.
         num_computed_tokens = min(num_computed_tokens, request.num_tokens)
-- 
GitLab


From 46ecc579733f13a555ca42da76c1234c586271eb Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 6 Jun 2025 16:28:17 -0700
Subject: [PATCH 218/274] [BugFix] Fix tpu_model_runner block_id concatenation
 (#19228)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 tests/v1/tpu/worker/test_tpu_model_runner.py | 2 +-
 vllm/v1/worker/gpu_model_runner.py           | 7 +++++--
 vllm/v1/worker/tpu_model_runner.py           | 6 +++++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index e351f0e92..73c0da45d 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -226,7 +226,7 @@ def test_update_states_request_resumed(model_runner):
         req_id=req_id,
         resumed_from_preemption=False,
         new_token_ids=[],
-        new_block_ids=[],
+        new_block_ids=[[]],
         num_computed_tokens=0,
     )
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a90c294a9..e3535ef14 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -460,8 +460,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             # Update the block IDs.
             if not req_data.resumed_from_preemption:
                 # Append the new blocks to the existing block IDs.
-                for i in range(len(self.kv_cache_config.kv_cache_groups)):
-                    req_state.block_ids[i].extend(req_data.new_block_ids[i])
+                for block_ids, new_block_ids in zip(  # type: ignore[call-overload]
+                        req_state.block_ids,
+                        req_data.new_block_ids,
+                        strict=True):
+                    block_ids.extend(new_block_ids)
             else:
                 # The request is resumed from preemption.
                 # Replace the existing block IDs with the new ones.
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 843bc3695..d5f40e4d3 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -413,7 +413,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             req_state.num_computed_tokens = req_data.num_computed_tokens
             if not req_data.resumed_from_preemption:
                 # Append the new blocks to the existing block IDs.
-                req_state.block_ids.extend(req_data.new_block_ids)
+                for block_ids, new_block_ids in zip(  # type: ignore[call-overload]
+                        req_state.block_ids,
+                        req_data.new_block_ids,
+                        strict=True):
+                    block_ids.extend(new_block_ids)
             else:
                 # The request is resumed from preemption.
                 # Replace the existing block IDs with the new ones.
-- 
GitLab


From 441b65d8c7818bff8385a28c72ad8354bd51d53f Mon Sep 17 00:00:00 2001
From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com>
Date: Fri, 6 Jun 2025 16:31:19 -0700
Subject: [PATCH 219/274] [Misc][Tools][Benchmark] Fix and improve auto tune
 script (#19163)

Signed-off-by: Chenyaaang <chenyangli@google.com>
---
 benchmarks/auto_tune.sh | 164 ++++++++++++++++++++++++----------------
 1 file changed, 98 insertions(+), 66 deletions(-)

diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune.sh
index ea63c6f71..1b01bbd61 100644
--- a/benchmarks/auto_tune.sh
+++ b/benchmarks/auto_tune.sh
@@ -10,11 +10,15 @@
 # 3. Set variables (ALL REQUIRED)
 #   BASE: your directory for vllm repo
 #   MODEL: the model served by vllm
+#   TP: ways of tensor parallelism
 #   DOWNLOAD_DIR: directory to download and load model weights.
 #   INPUT_LEN: request input len
 #   OUTPUT_LEN: request output len
 #   MIN_CACHE_HIT_PCT: prefix cache rate
 #   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
+#   NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
+#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
+#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
 # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
 # 5. The final result will be saved in RESULT file. 
 
@@ -30,31 +34,27 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
+TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
-MIN_CACHE_HIT_PCT_PCT=0
+MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
+NUM_SEQS_LIST="128 256"
+NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
 
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 
-echo "result file$ $RESULT"
+echo "result file: $RESULT"
 echo "model: $MODEL"
-echo
 
 rm -rf $LOG_FOLDER
 mkdir -p $LOG_FOLDER
 
 cd "$BASE/vllm"
-# create sonnet-4x.txt so that we can sample 2048 tokens for input
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
-do
-cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done
 
-pip install datasets
+pip install -q datasets
 
 current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
@@ -64,53 +64,69 @@ best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
 best_goodput=0
-run_benchmark() {
-    local max_num_seqs=$1
-    local max_num_batched_tokens=$2
-    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
-    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
-    echo "vllm_log: $vllm_log"
-    echo
-    rm -f $vllm_log
 
-    # start the server
+start_server() {
+    local gpu_memory_utilization=$1
+    local max_num_seqs=$2
+    local max_num_batched_tokens=$3
+    local vllm_log=$4
+    
+    pkill -f vllm
+
     VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
         --disable-log-requests \
         --port 8004 \
-        --gpu-memory-utilization 0.98 \
+        --gpu-memory-utilization $gpu_memory_utilization \
         --max-num-seqs $max_num_seqs \
         --max-num-batched-tokens $max_num_batched_tokens \
-        --tensor-parallel-size 1 \
+        --tensor-parallel-size $TP \
         --enable-prefix-caching \
         --load-format dummy \
-        --download-dir $DOWNLOAD_DIR \
+        --download-dir "$DOWNLOAD_DIR" \
         --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
-    echo "wait for 10 minutes.."
-    echo
+
     # wait for 10 minutes...
     server_started=0
-    for i in {1..60}; do        
-        if grep -Fq "Application startup complete" "$vllm_log"; then
-            echo "Application started"
+    for i in {1..60}; do  
+        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) 
+        if [[ "$STATUS_CODE" -eq 200 ]]; then
             server_started=1
             break
         else
-            # echo "wait for 10 seconds..."
             sleep 10
         fi
     done
- 
     if (( ! server_started )); then
-        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
-        echo "pkill -f vllm"
-        echo
-        pkill vllm
-        sleep 10
+        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
         return 1
+    else
+        return 0
     fi
+}
+
+run_benchmark() {
+    local max_num_seqs=$1
+    local max_num_batched_tokens=$2
+    local gpu_memory_utilization=$3
+    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
+    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
+    echo "vllm_log: $vllm_log"
+    echo
+    rm -f $vllm_log
+    pkill -f vllm
+
+    echo "starting server..."
+    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
+    result=$?
+    if [[ "$result" -eq 1 ]]; then
+        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
+    else
+        echo "server started."
+    fi
+    echo
     
     echo "run benchmark test..."
-    echo
     meet_latency_requirement=0
     # get a basic qps by using request-rate inf
     bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@@ -118,29 +134,29 @@ run_benchmark() {
     python benchmarks/benchmark_serving.py \
         --backend vllm \
         --model $MODEL  \
-        --dataset-name sonnet \
-        --dataset-path benchmarks/sonnet_4x.txt \
-        --sonnet-input-len $INPUT_LEN \
-        --sonnet-output-len $OUTPUT_LEN \
+        --dataset-name random \
+        --random-input-len $INPUT_LEN \
+        --random-output-len $OUTPUT_LEN \
         --ignore-eos \
         --disable-tqdm \
         --request-rate inf \
         --percentile-metrics ttft,tpot,itl,e2el \
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
-        --num-prompts 100 \
-        --sonnet-prefix-len $prefix_len \
-        --port 8004 > "$bm_log"
-    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+        --num-prompts 1000 \
+        --random-prefix-len $prefix_len \
+        --port 8004 &> "$bm_log"
+    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
     goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 
     if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
         meet_latency_requirement=1
+        request_rate=inf
     fi
 
     if (( ! meet_latency_requirement )); then
-    # start from request-rate as int(through_put) + 1
-        request_rate=$((${through_put%.*} + 1))
+    # start from request-rate as int(throughput) + 1
+        request_rate=$((${throughput%.*} + 1))
         while ((request_rate > 0)); do
             # clear prefix cache
             curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@@ -149,19 +165,18 @@ run_benchmark() {
             python benchmarks/benchmark_serving.py \
                 --backend vllm \
                 --model $MODEL  \
-                --dataset-name sonnet \
-                --dataset-path benchmarks/sonnet_4x.txt \
-                --sonnet-input-len $INPUT_LEN \
-                --sonnet-output-len $OUTPUT_LEN \
-                --ignore_eos \
+                --dataset-name random \
+                --random-input-len $INPUT_LEN \
+                --random-output-len $OUTPUT_LEN \
+                --ignore-eos \
                 --disable-tqdm \
                 --request-rate $request_rate \
                 --percentile-metrics ttft,tpot,itl,e2el \
                 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                 --num-prompts 100 \
-                --sonnet-prefix-len $prefix_len \
-                --port 8004 > "$bm_log"
-            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+                --random-prefix-len $prefix_len \
+                --port 8004 &> "$bm_log"
+            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
             e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
             goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
             if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@@ -173,10 +188,10 @@ run_benchmark() {
     fi
     # write the results and update the best result.
     if ((meet_latency_requirement)); then
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
-        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
-            best_throughput=$through_put
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
+        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
+            best_throughput=$throughput
             best_max_num_seqs=$max_num_seqs
             best_num_batched_tokens=$max_num_batched_tokens
             best_goodput=$goodput
@@ -188,22 +203,39 @@ run_benchmark() {
 
     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 
-    echo "pkill -f vllm"
-    echo
     pkill vllm
     sleep 10
-    rm -f $vllm_log
     printf '=%.0s' $(seq 1 20)
     return 0
 }
 
+read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
+read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
+
+# first find out the max gpu-memory-utilization without HBM OOM.
+gpu_memory_utilization=0.98
+find_gpu_memory_utilization=0
+while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
+    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
+    result=$?
+    if [[ "$result" -eq 0 ]]; then
+        find_gpu_memory_utilization=1
+        break
+    else
+        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
+    fi
+done
+
+if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
+    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
+else
+    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
+    exit 1
+fi
 
-num_seqs_list="128 256"
-num_batched_tokens_list="512 1024 2048 4096"
-for num_seqs in $num_seqs_list; do
-    for num_batched_tokens in $num_batched_tokens_list; do
-        run_benchmark $num_seqs $num_batched_tokens
-        exit 0
+for num_seqs in "${num_seqs_list[@]}"; do
+    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
+        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
     done
 done
 echo "finish permutations"
-- 
GitLab


From e010688f50a2e90587a01ada351c5023a10273c2 Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Fri, 6 Jun 2025 18:35:16 -0500
Subject: [PATCH 220/274] [Build][ROCm] Update Dockerfile.rocm (#19296)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
---
 docker/Dockerfile.rocm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index b186f88d2..34641dd0a 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -28,7 +28,8 @@ ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
 ARG VLLM_BRANCH="main"
 ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
-	    && git checkout ${VLLM_BRANCH}
+	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
+	    && git checkout FETCH_HEAD
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
 # -----------------------
-- 
GitLab


From 6e0cd10f726692de1d30523eff0fd7d2b927f375 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Sat, 7 Jun 2025 09:19:09 +0800
Subject: [PATCH 221/274] [Easy][Test] Simplify test_function_tool_use with
 multiple parametrizes (#19269)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 .../test_completion_with_function_calling.py  | 73 ++++---------------
 1 file changed, 16 insertions(+), 57 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 5a1832865..84ad7a091 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import NamedTuple
-
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
@@ -39,53 +37,14 @@ async def client(server):
         yield async_client
 
 
-class TestCase(NamedTuple):
-    model_name: str
-    stream: bool
-    tool_choice: str
-    enable_thinking: bool
-
-
 @pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        TestCase(model_name=MODEL_NAME,
-                 stream=True,
-                 tool_choice="auto",
-                 enable_thinking=False),
-        TestCase(model_name=MODEL_NAME,
-                 stream=False,
-                 tool_choice="auto",
-                 enable_thinking=False),
-        TestCase(model_name=MODEL_NAME,
-                 stream=True,
-                 tool_choice="required",
-                 enable_thinking=False),
-        TestCase(model_name=MODEL_NAME,
-                 stream=False,
-                 tool_choice="required",
-                 enable_thinking=False),
-        TestCase(model_name=MODEL_NAME,
-                 stream=True,
-                 tool_choice="auto",
-                 enable_thinking=True),
-        TestCase(model_name=MODEL_NAME,
-                 stream=False,
-                 tool_choice="auto",
-                 enable_thinking=True),
-        TestCase(model_name=MODEL_NAME,
-                 stream=True,
-                 tool_choice="required",
-                 enable_thinking=True),
-        TestCase(model_name=MODEL_NAME,
-                 stream=False,
-                 tool_choice="required",
-                 enable_thinking=True),
-    ],
-)
-async def test_function_tool_use(client: openai.AsyncOpenAI,
-                                 test_case: TestCase):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("stream", [True, False])
+@pytest.mark.parametrize("tool_choice", ["auto", "required"])
+@pytest.mark.parametrize("enable_thinking", [True, False])
+async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
+                                 stream: bool, tool_choice: str,
+                                 enable_thinking: bool):
     tools = [
         {
             "type": "function",
@@ -174,16 +133,16 @@ async def test_function_tool_use(client: openai.AsyncOpenAI,
             "forecast for the next 5 days, in fahrenheit?",
         },
     ]
-    if not test_case.stream:
+    if not stream:
         # Non-streaming test
         chat_completion = await client.chat.completions.create(
             messages=messages,
-            model=test_case.model_name,
+            model=model_name,
             tools=tools,
-            tool_choice=test_case.tool_choice,
+            tool_choice=tool_choice,
             extra_body={
                 "chat_template_kwargs": {
-                    "enable_thinking": test_case.enable_thinking
+                    "enable_thinking": enable_thinking
                 }
             })
 
@@ -191,20 +150,20 @@ async def test_function_tool_use(client: openai.AsyncOpenAI,
         assert len(chat_completion.choices[0].message.tool_calls) > 0
     else:
         # Streaming test
-        stream = await client.chat.completions.create(
+        output_stream = await client.chat.completions.create(
             messages=messages,
-            model=test_case.model_name,
+            model=model_name,
             tools=tools,
-            tool_choice=test_case.tool_choice,
+            tool_choice=tool_choice,
             stream=True,
             extra_body={
                 "chat_template_kwargs": {
-                    "enable_thinking": test_case.enable_thinking
+                    "enable_thinking": enable_thinking
                 }
             })
 
         output = []
-        async for chunk in stream:
+        async for chunk in output_stream:
             if chunk.choices and chunk.choices[0].delta.tool_calls:
                 output.extend(chunk.choices[0].delta.tool_calls)
 
-- 
GitLab


From 84166fee9770e6fba71a96978b3e7d149392fb28 Mon Sep 17 00:00:00 2001
From: ElizaWszola <ewszola@redhat.com>
Date: Sat, 7 Jun 2025 03:26:11 +0200
Subject: [PATCH 222/274] [Kernel] Integrate CUTLASS MoE kernel with PPLX
 (#18762)

Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 CMakeLists.txt                                |   4 +-
 .../kernels/benchmark_grouped_gemm_cutlass.py |  61 +--
 csrc/ops.h                                    |  11 +-
 .../cutlass_w8a8/moe/grouped_mm_c3x.cu        |  29 +-
 .../cutlass_w8a8/moe/grouped_mm_c3x.cuh       |   6 +-
 .../quantization/cutlass_w8a8/moe/moe_data.cu |  42 +-
 .../cutlass_w8a8/scaled_mm_entry.cu           |  39 +-
 csrc/torch_bindings.cpp                       |  19 +-
 tests/kernels/moe/test_cutlass_moe.py         |   8 +-
 tests/kernels/moe/test_pplx_cutlass_moe.py    | 287 ++++++++++++++
 tests/kernels/moe/test_pplx_moe.py            | 131 +------
 .../quantization/test_cutlass_scaled_mm.py    |   3 +-
 tests/pplx_utils.py                           | 123 ++++++
 vllm/_custom_ops.py                           |  29 +-
 .../layers/fused_moe/batched_deep_gemm_moe.py |   1 +
 .../batched_triton_or_deep_gemm_moe.py        |   5 +-
 .../layers/fused_moe/cutlass_moe.py           | 363 +++++++++++-------
 .../layers/fused_moe/deep_gemm_moe.py         |   1 +
 .../layers/fused_moe/fused_batched_moe.py     |   2 +
 .../layers/fused_moe/fused_moe.py             |   1 +
 vllm/model_executor/layers/fused_moe/layer.py |  58 ++-
 .../layers/fused_moe/modular_kernel.py        |   3 +-
 .../layers/fused_moe/pplx_prepare_finalize.py |  21 +-
 .../layers/fused_moe/triton_deep_gemm_moe.py  |   5 +-
 .../compressed_tensors_moe.py                 |  87 +++--
 .../model_executor/layers/quantization/fp8.py |   2 +-
 26 files changed, 925 insertions(+), 416 deletions(-)
 create mode 100644 tests/kernels/moe/test_pplx_cutlass_moe.py
 create mode 100644 tests/pplx_utils.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f11d28590..afaed7cd1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -543,8 +543,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUTLASS MoE kernels
 
   # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
-  # to compile MoE kernels that use its output.
+  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
+  # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 1be83b84e..acabe6c1d 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -7,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    cutlass_moe_fp8,
     fused_experts,
     fused_topk,
 )
@@ -70,18 +70,9 @@ def bench_run(
     w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
     w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
 
-    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
-    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
-    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-
     for expert in range(num_experts):
         w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
         w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
-    w1_q_notransp = w1_q.clone()
-    w2_q_notransp = w2_q.clone()
-    w1_q = w1_q.transpose(1, 2)
-    w2_q = w2_q.transpose(1, 2)
 
     score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
 
@@ -122,10 +113,6 @@ def bench_run(
         w2_scale: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
         num_repeats: int,
     ):
         for _ in range(num_repeats):
@@ -133,14 +120,10 @@ def bench_run(
                 a,
                 w1,
                 w2,
-                w1_scale,
-                w2_scale,
                 topk_weights,
                 topk_ids,
-                ab_strides1,
-                c_strides1,
-                ab_strides2,
-                c_strides2,
+                w1_scale,
+                w2_scale,
                 a1_scale=a_scale,
             )
 
@@ -153,10 +136,6 @@ def bench_run(
         w2_scale: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
     ):
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@@ -165,14 +144,10 @@ def bench_run(
                 a,
                 w1_q,
                 w2_q,
-                w1_scale,
-                w2_scale,
                 topk_weights,
                 topk_ids,
-                ab_strides1,
-                c_strides1,
-                ab_strides2,
-                c_strides2,
+                w1_scale,
+                w2_scale,
                 a1_scale=a_scale,
             )
 
@@ -218,10 +193,6 @@ def bench_run(
             w2_scale,
             topk_weights,
             topk_ids,
-            ab_strides1,
-            c_strides1,
-            ab_strides2,
-            c_strides2,
         )
     torch.cuda.synchronize()
 
@@ -230,8 +201,8 @@ def bench_run(
     with torch.cuda.graph(triton_graph, stream=triton_stream):
         run_triton_from_graph(
             a,
-            w1_q_notransp,
-            w2_q_notransp,
+            w1_q,
+            w2_q,
             topk_weights,
             topk_ids,
             w1_scale,
@@ -250,18 +221,12 @@ def bench_run(
         "w2": w2,
         "score": score,
         "topk": topk,
-        "w1_q_notransp": w1_q_notransp,
-        "w2_q_notransp": w2_q_notransp,
         # Cutlass params
         "a_scale": a_scale,
         "w1_q": w1_q,
         "w2_q": w2_q,
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
-        "ab_strides1": ab_strides1,
-        "c_strides1": c_strides1,
-        "ab_strides2": ab_strides2,
-        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -279,8 +244,8 @@ def bench_run(
     # Warmup
     run_triton_moe(
         a,
-        w1_q_notransp,
-        w2_q_notransp,
+        w1_q,
+        w2_q,
         topk_weights,
         topk_ids,
         w1_scale,
@@ -291,7 +256,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
+            stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -322,16 +287,12 @@ def bench_run(
         w2_scale,
         topk_weights,
         topk_ids,
-        ab_strides1,
-        c_strides1,
-        ab_strides2,
-        c_strides2,
         num_warmup,
     )
 
     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
diff --git a/csrc/ops.h b/csrc/ops.h
index 6905ef6e5..f02f5083a 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -236,7 +236,8 @@ void cutlass_moe_mm(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch);
 
 void cutlass_fp4_group_mm(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@@ -251,6 +252,14 @@ void get_cutlass_moe_mm_data(
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);
 
+void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
+                                  torch::Tensor& problem_sizes1,
+                                  torch::Tensor& problem_sizes2,
+                                  const torch::Tensor& expert_num_tokens,
+                                  const int64_t num_local_experts,
+                                  const int64_t padded_m, const int64_t n,
+                                  const int64_t k);
+
 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
index 2b8bc3fb0..c88e134ae 100644
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
@@ -84,7 +84,8 @@ void run_cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
   TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
   TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@@ -113,19 +114,23 @@ void run_cutlass_moe_mm_sm90(
   if (n >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else if (k >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else if (m <= 16) {
     cutlass_group_gemm_caller<Cutlass3xGemmM16>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else {
     cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   }
 }
 
@@ -134,15 +139,18 @@ void dispatch_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   if (out_tensors.dtype() == torch::kBFloat16) {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   }
 }
 
@@ -153,8 +161,9 @@ void cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                        expert_offsets, problem_sizes, a_strides, b_strides,
-                       c_strides);
+                       c_strides, per_act_token, per_out_ch);
 }
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
index db827b7c5..bbd82d72e 100644
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
@@ -76,7 +76,8 @@ void cutlass_group_gemm_caller(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
 
@@ -84,9 +85,6 @@ void cutlass_group_gemm_caller(
   int k_size = a_tensors.size(1);
   int n_size = out_tensors.size(1);
 
-  bool per_act_token = a_scales.numel() != 1;
-  bool per_out_ch = b_scales.numel() != num_experts;
-
   auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
 
   auto options_int =
diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
index ac414e1bc..32254641c 100644
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
@@ -7,7 +7,7 @@
 
 constexpr uint64_t THREADS_PER_EXPERT = 512;
 
-__global__ void compute_problem_sizes(const int* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
                                       int32_t* problem_sizes1,
                                       int32_t* problem_sizes2,
                                       int32_t* atomic_buffer,
@@ -62,7 +62,7 @@ __global__ void compute_expert_blockscale_offsets(
   }
 }
 
-__global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
+__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
                                   const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,
@@ -103,7 +103,7 @@ void get_cutlass_moe_mm_data_caller(
 
   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
   compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const uint32_t*>(topk_ids.data_ptr()),
       static_cast<int32_t*>(problem_sizes1.data_ptr()),
       static_cast<int32_t*>(problem_sizes2.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
@@ -120,10 +120,44 @@ void get_cutlass_moe_mm_data_caller(
         static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
   }
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const uint32_t*>(topk_ids.data_ptr()),
       static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
       topk_ids.size(1));
 }
+
+__global__ void compute_pplx_data(int32_t* expert_offsets,
+                                  int32_t* problem_sizes1,
+                                  int32_t* problem_sizes2,
+                                  const int32_t* __restrict__ expert_num_tokens,
+                                  const int padded_m, const int n,
+                                  const int k) {
+  int expert_idx = threadIdx.x;
+
+  expert_offsets[expert_idx] = expert_idx * padded_m;
+  problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx];
+  problem_sizes1[expert_idx * 3 + 1] = 2 * n;
+  problem_sizes1[expert_idx * 3 + 2] = k;
+  problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx];
+  problem_sizes2[expert_idx * 3 + 1] = k;
+  problem_sizes2[expert_idx * 3 + 2] = n;
+}
+
+void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
+                                         torch::Tensor& problem_sizes1,
+                                         torch::Tensor& problem_sizes2,
+                                         const torch::Tensor& expert_num_tokens,
+                                         const int64_t num_local_experts,
+                                         const int64_t padded_m,
+                                         const int64_t n, const int64_t k) {
+  auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
+
+  compute_pplx_data<<<1, num_local_experts, 0, stream>>>(
+      static_cast<int32_t*>(expert_offsets.data_ptr()),
+      static_cast<int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(problem_sizes2.data_ptr()),
+      static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
+      k);
+}
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index ee93440b5..348525810 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -36,7 +36,8 @@ void cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch);
 
 #endif
 
@@ -56,6 +57,14 @@ void get_cutlass_moe_mm_data_caller(
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);
+
+void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
+                                         torch::Tensor& problem_sizes1,
+                                         torch::Tensor& problem_sizes2,
+                                         const torch::Tensor& expert_num_tokens,
+                                         const int64_t num_local_experts,
+                                         const int64_t padded_m,
+                                         const int64_t n, const int64_t k);
 #endif
 
 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@@ -207,12 +216,13 @@ void cutlass_moe_mm(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
   cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                       expert_offsets, problem_sizes, a_strides, b_strides,
-                      c_strides);
+                      c_strides, per_act_token, per_out_ch);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
@@ -245,6 +255,29 @@ void get_cutlass_moe_mm_data(
       version_num, ". Required capability: 90");
 }
 
+void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
+                                  torch::Tensor& problem_sizes1,
+                                  torch::Tensor& problem_sizes2,
+                                  const torch::Tensor& expert_num_tokens,
+                                  const int64_t num_local_experts,
+                                  const int64_t padded_m, const int64_t n,
+                                  const int64_t k) {
+  // This function currently gets compiled only if we have a valid cutlass moe
+  // mm to run it for.
+  int32_t version_num = get_sm_version_num();
+#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
+  get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
+                                      problem_sizes2, expert_num_tokens,
+                                      num_local_experts, padded_m, n, k);
+  return;
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
+      "for CUDA device capability: ",
+      version_num, ". Required capability: 90");
+}
+
 void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 93916b7f9..1a1896b4c 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -435,7 +435,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "cutlass_moe_mm(Tensor! out_tensors, Tensor a_tensors, Tensor b_tensors, "
       "               Tensor a_scales, Tensor b_scales, Tensor expert_offsets, "
       "               Tensor problem_sizes, Tensor a_strides, "
-      "               Tensor b_strides, Tensor c_strides) -> ()",
+      "               Tensor b_strides, Tensor c_strides, bool per_act_token, "
+      "               bool per_out_ch) -> ()",
       {stride_tag});
   ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm);
 
@@ -454,6 +455,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       {stride_tag});
   ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
 
+  // A function that computes data required to run fused MoE with w8a8 grouped
+  // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
+  // as an input, and computes expert_offsets (token start indices of each
+  // expert). In addition to this, it computes problem sizes for each expert's
+  // multiplication used by the two mms called from fused MoE operation.
+  ops.def(
+      "get_cutlass_pplx_moe_mm_data(Tensor! expert_offsets, "
+      "                             Tensor! problem_sizes1, "
+      "                             Tensor! problem_sizes2, "
+      "                             Tensor expert_num_tokens, "
+      "                             int num_local_experts, int padded_m, "
+      "                             int n, int k) -> ()",
+      {stride_tag});
+  ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
+           &get_cutlass_pplx_moe_mm_data);
+
   // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
   ops.def(
       "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py
index 558288ba4..474745f94 100644
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -193,14 +193,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
 
     kwargs = {
         'a': moe_tensors.a,
-        'w1_q': moe_tensors.w1_q.transpose(1, 2),  # type: ignore[union-attr]
-        'w2_q': moe_tensors.w2_q.transpose(1, 2),  # type: ignore[union-attr]
+        'w1_q': moe_tensors.w1_q,  # type: ignore[union-attr]
+        'w2_q': moe_tensors.w2_q,  # type: ignore[union-attr]
         'topk_weights': topk_weights,
         'topk_ids': topk_ids,
-        'ab_strides1': moe_tensors.ab_strides1,
-        'c_strides1': moe_tensors.c_strides1,
-        'ab_strides2': moe_tensors.ab_strides2,
-        'c_strides2': moe_tensors.c_strides2,
         'w1_scale': moe_tensors.w1_scale,
         'w2_scale': moe_tensors.w2_scale,
         'a1_scale': moe_tensors.a_scale
diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py
new file mode 100644
index 000000000..ef3e6adcf
--- /dev/null
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -0,0 +1,287 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.pplx_utils import ProcessGroupInfo, parallel_launch
+from vllm import _custom_ops as ops
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
+from vllm.platforms import current_platform
+
+try:
+    from pplx_kernels import AllToAll
+    from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
+                                      nvshmem_finalize, nvshmem_get_unique_id,
+                                      nvshmem_init)
+    has_pplx = True
+except ImportError:
+    has_pplx = False
+
+requires_pplx = pytest.mark.skipif(
+    not has_pplx,
+    reason="Requires PPLX kernels",
+)
+
+NUM_EXPERTS = [40, 64]
+TOP_KS = [6, 8]
+
+
+def rank_chunk(num, r, w):
+    rem = num % w
+    return (num // w) + (1 if r < rem else 0)
+
+
+def chunk_by_rank(t, r, w):
+    num = t.shape[0]
+    chunk = rank_chunk(num, r, w)
+    rem = num % w
+    if rem == 0 or r < rem:
+        return t[(r * chunk):(r + 1) * chunk].contiguous()
+    else:
+        long_chunks = (num // w + 1) * rem
+        short_chunks = (r - rem) * chunk
+        start = long_chunks + short_chunks
+        return t[start:start + chunk].contiguous()
+
+
+def pplx_cutlass_moe(
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a1_scale: torch.Tensor,
+    out_dtype,
+    per_act_token: bool,
+    per_out_ch: bool,
+):
+    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
+        PplxPrepareAndFinalize)
+    assert torch.cuda.current_device() == pgi.local_rank
+
+    num_tokens, hidden_dim = a.shape
+    num_experts = w1.shape[0]
+    block_size = hidden_dim  # TODO support more cases
+    device = pgi.device
+    rank = pgi.rank
+    world_size = pgi.world_size
+    rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
+    max_num_tokens = rank_chunk(num_tokens, 0, world_size)
+    topk = topk_ids.shape[1]
+
+    if block_size == hidden_dim:
+        scale_elems = 4  # hack to circumvent pplx data format requirements
+    else:
+        scale_elems = (hidden_dim + block_size - 1) // block_size
+
+    ata = AllToAll.internode(
+        max_num_tokens=max_num_tokens,
+        num_experts=num_experts,
+        experts_per_token=topk,
+        rank=rank,
+        world_size=pgi.world_size,
+        dp_size=dp_size,
+        hidden_dim=hidden_dim,
+        hidden_dim_bytes=hidden_dim,  # because a.dtype.itemsize == 1
+        hidden_dim_scale_bytes=scale_elems * torch.float32.itemsize,
+    )
+
+    w1 = w1.to(device)
+    w2 = w2.to(device)
+    w1_scale = w1_scale.to(device)
+    w2_scale = w2_scale.to(device)
+    a1_scale = a1_scale.to(device)
+
+    prepare_finalize = PplxPrepareAndFinalize(
+        ata,
+        max_num_tokens,
+        pgi.world_size,
+        rank,
+        dp_size,
+        quant_dtype=torch.float8_e4m3fn,
+        per_act_token=per_act_token,
+    )
+
+    experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
+                                out_dtype, per_act_token, per_out_ch)
+
+    fused_cutlass_experts = FusedMoEModularKernel(
+        prepare_finalize,
+        experts,
+    )
+
+    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
+    chunk_topk_weight = chunk_by_rank(topk_weights, rank,
+                                      world_size).to(device)
+    chunk_topk_ids = chunk_by_rank(topk_ids, rank,
+                                   world_size).to(torch.uint32).to(device)
+
+    out = fused_cutlass_experts(
+        a_chunk,
+        chunk_by_rank(w1, rank, world_size),
+        chunk_by_rank(w2, rank, world_size),
+        chunk_topk_weight,
+        chunk_topk_ids,
+        global_num_experts=num_experts,
+        expert_map=None,  #TODO
+        w1_scale=chunk_by_rank(w1_scale, rank, world_size),
+        w2_scale=chunk_by_rank(w2_scale, rank, world_size),
+        a1_scale=chunk_by_rank(a1_scale, rank, world_size)
+        if per_act_token else a1_scale[rank])
+
+    torch.cuda.synchronize()
+
+    ata.destroy()
+
+    return out[:rank_num_tokens]
+
+
+vllm_config = VllmConfig()
+vllm_config.scheduler_config.max_num_seqs = 128
+vllm_config.scheduler_config.max_model_len = 8192
+
+
+def torch_moe2(a, w1, w2, topk_weight, topk_ids):
+    M, K = a.shape
+    topk = topk_ids.shape[1]
+    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
+    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    num_experts = w1.shape[0]
+    for i in range(num_experts):
+        mask = (topk_ids == i).view(-1)
+        if mask.sum():
+            out[mask] = SiluAndMul()(
+                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
+
+    return (out.view(M, -1, w2.shape[1]) *
+            topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
+
+
+def _pplx_moe(
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a1_scale: torch.Tensor,
+    out_dtype,
+    a_full: torch.Tensor,
+    w1_full: torch.Tensor,
+    w2_full: torch.Tensor,
+    per_act_token: bool,
+    per_out_ch: bool,
+):
+    uid = nvshmem_get_unique_id(
+    ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+    torch.distributed.broadcast(uid, src=0)
+    nvshmem_init(uid, pgi.rank, pgi.world_size)
+
+    with set_current_vllm_config(vllm_config):
+        torch_output = torch_moe2(a_full, w1_full, w2_full, topk_weights,
+                                  topk_ids)
+        pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
+                                       w2_scale, topk_weights, topk_ids,
+                                       a1_scale, out_dtype, per_act_token,
+                                       per_out_ch)
+
+        torch_output = chunk_by_rank(torch_output, pgi.rank,
+                                     pgi.world_size).to(pplx_output.device)
+
+    # Uncomment if more debugging is needed
+    # print("PPLX OUT:", pplx_output)
+    # print("TORCH OUT:", torch_output)
+
+    torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
+
+    nvshmem_finalize()
+
+
+@pytest.mark.parametrize("m", [2, 224])
+@pytest.mark.parametrize("n", [3072])
+@pytest.mark.parametrize("k", [1536])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("world_dp_size", [[2, 1]])  #, [4, 2]])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+@requires_pplx
+def test_cutlass_moe_pplx(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_ch: bool,
+    world_dp_size: tuple[int, int],
+):
+    current_platform.seed_everything(7)
+
+    with set_current_vllm_config(vllm_config):
+
+        dtype = torch.half
+
+        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10.0
+        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10.0
+        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10.0
+
+        n_b_scales = 2 * n if per_out_ch else 1
+        k_b_scales = k if per_out_ch else 1
+
+        w1_q = torch.empty((e, 2 * n, k),
+                           device="cuda",
+                           dtype=torch.float8_e4m3fn)
+        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
+        w1_scale = torch.empty((e, n_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+        w2_scale = torch.empty((e, k_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+
+        for expert in range(e):
+            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
+                w1[expert], use_per_token_if_dynamic=per_out_ch)
+            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
+                w2[expert], use_per_token_if_dynamic=per_out_ch)
+
+        w1_d = torch.empty_like(w1)
+        w2_d = torch.empty_like(w2)
+        for expert in range(e):
+            w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half()
+            w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half()
+
+        score = torch.randn((m, e), device="cuda", dtype=dtype)
+        topk_weights, topk_ids, _ = fused_topk(a,
+                                               score,
+                                               topk,
+                                               renormalize=False)
+
+        world_size, dp_size = world_dp_size
+        a_scale1 = torch.randn(
+            (m if per_act_token else 1, 1), device="cuda",
+            dtype=torch.float32) / 10.0
+        if not per_act_token:
+            a_scale1 = a_scale1.repeat(world_size, 1)
+
+        parallel_launch(world_size, _pplx_moe, dp_size, a, w1_q, w2_q,
+                        w1_scale, w2_scale, topk_weights, topk_ids, a_scale1,
+                        dtype, a, w1_d, w2_d, per_act_token, per_out_ch)
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index 95c10037b..bbfe31d0e 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -4,10 +4,7 @@
 
 Run `pytest tests/kernels/test_pplx_moe.py`.
 """
-import dataclasses
-import os
-import traceback
-from typing import Callable, Optional
+from typing import Optional
 
 import pytest
 import torch
@@ -21,10 +18,7 @@ try:
 except ImportError:
     has_pplx = False
 
-from torch.multiprocessing import (
-    spawn)  # pyright: ignore[reportPrivateImportUsage]
-from typing_extensions import Concatenate, ParamSpec
-
+from tests.pplx_utils import ProcessGroupInfo, parallel_launch
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import override_config
@@ -36,6 +30,11 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
 from vllm.platforms import current_platform
 
+requires_pplx = pytest.mark.skipif(
+    not has_pplx,
+    reason="Requires PPLX kernels",
+)
+
 PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
                        (222, 2048, 1024)]
 
@@ -57,122 +56,6 @@ vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192
 
-P = ParamSpec("P")
-
-requires_pplx = pytest.mark.skipif(
-    not has_pplx,
-    reason="Requires PPLX kernels",
-)
-
-
-@dataclasses.dataclass
-class ProcessGroupInfo:
-    world_size: int
-    world_local_size: int
-    rank: int
-    node_rank: int
-    local_rank: int
-    device: torch.device
-
-
-def _worker_parallel_launch(
-    local_rank: int,
-    world_size: int,
-    world_local_size: int,
-    node_rank: int,
-    init_method: str,
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    rank = node_rank * world_local_size + local_rank
-    torch.cuda.set_device(local_rank)
-    device = torch.device("cuda", local_rank)
-    torch.distributed.init_process_group(
-        backend="cpu:gloo,cuda:nccl",
-        init_method=init_method,
-        rank=rank,
-        world_size=world_size,
-        device_id=device,
-    )
-    barrier = torch.tensor([rank], device=device)
-    torch.distributed.all_reduce(barrier)
-
-    try:
-        worker(
-            ProcessGroupInfo(
-                world_size=world_size,
-                world_local_size=world_local_size,
-                rank=rank,
-                node_rank=node_rank,
-                local_rank=local_rank,
-                device=device,
-            ),
-            *args,
-            **kwargs,
-        )
-    except Exception as ex:
-        print(ex)
-        traceback.print_exc()
-        raise
-    finally:
-        torch.distributed.destroy_process_group()
-
-
-def parallel_launch(
-    world_size: int,
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    assert not kwargs
-    spawn(
-        _worker_parallel_launch,
-        args=(
-            world_size,
-            world_size,
-            0,
-            "tcp://localhost:29500",
-            worker,
-        ) + args,
-        nprocs=world_size,
-        join=True,
-    )
-
-
-def parallel_launch_from_env(
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    """
-    Launches a worker function in parallel across all processes in the current
-    environment. The environment must have the following variables set:
-    - WORLD_SIZE: The total number of processes.
-    - WORLD_LOCAL_SIZE: The number of processes on the current node.
-    - NODE_RANK: The rank of the current
-    - MASTER_ADDR: The address of the master process.
-    - MASTER_PORT: The port of the master process.
-    """
-    assert not kwargs
-    world_size = int(os.environ["WORLD_SIZE"])
-    world_local_size = int(os.environ["WORLD_LOCAL_SIZE"])
-    node_rank = int(os.environ["NODE_RANK"])
-    assert "MASTER_ADDR" in os.environ
-    assert "MASTER_PORT" in os.environ
-    spawn(
-        _worker_parallel_launch,
-        args=(
-            world_size,
-            world_local_size,
-            node_rank,
-            "env://",
-            worker,
-        ) + args,
-        nprocs=world_local_size,
-        join=True,
-    )
-
 
 def torch_prepare(
     a: torch.Tensor,
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index 51bb29df0..c4d349f1a 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -632,7 +632,8 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
     ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked,
                        b_tensors_stacked, a_scales_tensors_stacked,
                        b_scales_tensors_stacked, expert_offsets[:-1],
-                       problem_sizes, ab_strides, ab_strides, c_strides)
+                       problem_sizes, ab_strides, ab_strides, c_strides,
+                       per_act_token, per_out_ch)
 
     # Validate each group's result against the baseline
     for g in range(num_experts):
diff --git a/tests/pplx_utils.py b/tests/pplx_utils.py
new file mode 100644
index 000000000..2d5d5be80
--- /dev/null
+++ b/tests/pplx_utils.py
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import dataclasses
+import os
+import traceback
+from typing import Callable
+
+import torch
+from torch.multiprocessing import (
+    spawn)  # pyright: ignore[reportPrivateImportUsage]
+from typing_extensions import Concatenate, ParamSpec
+
+P = ParamSpec("P")
+
+
+@dataclasses.dataclass
+class ProcessGroupInfo:
+    world_size: int
+    world_local_size: int
+    rank: int
+    node_rank: int
+    local_rank: int
+    device: torch.device
+
+
+def _worker_parallel_launch(
+    local_rank: int,
+    world_size: int,
+    world_local_size: int,
+    node_rank: int,
+    init_method: str,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    rank = node_rank * world_local_size + local_rank
+    torch.cuda.set_device(local_rank)
+    device = torch.device("cuda", local_rank)
+    torch.distributed.init_process_group(
+        backend="cpu:gloo,cuda:nccl",
+        init_method=init_method,
+        rank=rank,
+        world_size=world_size,
+        device_id=device,
+    )
+    barrier = torch.tensor([rank], device=device)
+    torch.distributed.all_reduce(barrier)
+
+    try:
+        worker(
+            ProcessGroupInfo(
+                world_size=world_size,
+                world_local_size=world_local_size,
+                rank=rank,
+                node_rank=node_rank,
+                local_rank=local_rank,
+                device=device,
+            ),
+            *args,
+            **kwargs,
+        )
+    except Exception as ex:
+        print(ex)
+        traceback.print_exc()
+        raise
+    finally:
+        torch.distributed.destroy_process_group()
+
+
+def parallel_launch(
+    world_size: int,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    assert not kwargs
+    spawn(
+        _worker_parallel_launch,
+        args=(
+            world_size,
+            world_size,
+            0,
+            "tcp://localhost:29500",
+            worker,
+        ) + args,
+        nprocs=world_size,
+        join=True,
+    )
+
+
+def parallel_launch_from_env(
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    """
+    Launches a worker function in parallel across all processes in the current
+    environment. The environment must have the following variables set:
+    - WORLD_SIZE: The total number of processes.
+    - WORLD_LOCAL_SIZE: The number of processes on the current node.
+    - NODE_RANK: The rank of the current
+    - MASTER_ADDR: The address of the master process.
+    - MASTER_PORT: The port of the master process.
+    """
+    assert not kwargs
+    world_size = int(os.environ["WORLD_SIZE"])
+    world_local_size = int(os.environ["WORLD_LOCAL_SIZE"])
+    node_rank = int(os.environ["NODE_RANK"])
+    assert "MASTER_ADDR" in os.environ
+    assert "MASTER_PORT" in os.environ
+    spawn(
+        _worker_parallel_launch,
+        args=(
+            world_size,
+            world_local_size,
+            node_rank,
+            "env://",
+            worker,
+        ) + args,
+        nprocs=world_local_size,
+        join=True,
+    )
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 14404cd73..92de1f5ef 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -899,11 +899,36 @@ def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
     return output_tensor
 
 
+def get_cutlass_pplx_moe_mm_data(expert_offsets: torch.Tensor,
+                                 problem_sizes1: torch.Tensor,
+                                 problem_sizes2: torch.Tensor,
+                                 expert_num_tokens: torch.Tensor,
+                                 num_local_experts: int, padded_m: int, n: int,
+                                 k: int):
+    """
+    Prepare data necessary to perform CUTLASS grouped matrix multiplications
+    used in CUTLASS-based fused MoE.
+
+    The function takes in expert_num_tokens (token count per expert) and
+    non_zero_expert_idxs (consecutive indices of experts with non-zero token 
+    counts) and uses them to compute:
+    - expert_offsets: Indices that mark at which token index each expert begins
+                      its computation.
+    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
+                                      multiplication in two grouped MMs used in
+                                      the fused MoE operation.
+    """
+    return torch.ops._C.get_cutlass_pplx_moe_mm_data(
+        expert_offsets, problem_sizes1, problem_sizes2, expert_num_tokens,
+        num_local_experts, padded_m, n, k)
+
+
 def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
                    b_tensors: torch.Tensor, a_scales: torch.Tensor,
                    b_scales: torch.Tensor, expert_offsets: torch.Tensor,
                    problem_sizes: torch.Tensor, a_strides: torch.Tensor,
-                   b_strides: torch.Tensor, c_strides: torch.Tensor):
+                   b_strides: torch.Tensor, c_strides: torch.Tensor,
+                   per_act_token: bool, per_out_ch: bool):
     """
     A single grouped matrix multiplication used in CUTLASS-based fused MoE.
     The function executes fp8-quantized OUT = AB matrix multiplication.
@@ -918,7 +943,7 @@ def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
     return torch.ops._C.cutlass_moe_mm(out_tensors, a_tensors, b_tensors,
                                        a_scales, b_scales, expert_offsets,
                                        problem_sizes, a_strides, b_strides,
-                                       c_strides)
+                                       c_strides, per_act_token, per_out_ch)
 
 
 def cutlass_fp4_moe_mm(a_tensors: torch.Tensor, b_tensors: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index a541d46b1..76d71ca08 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -39,6 +39,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
index 4db6b84e9..d62d519af 100644
--- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
@@ -67,6 +67,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -78,11 +79,11 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm and self.batched_deep_gemm_experts is not None:
             return self.batched_deep_gemm_experts.workspace_shapes(
-                a, M, N, K, topk, num_experts)
+                a, aq, M, N, K, topk, num_experts)
         else:
             assert self.batched_triton_experts is not None
             return self.batched_triton_experts.workspace_shapes(
-                a, M, N, K, topk, num_experts)
+                a, aq, M, N, K, topk, num_experts)
 
     def apply(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index e9446bc5f..6e7b1a4f2 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ CUTLASS based Fused MoE kernels."""
-from typing import Optional
+from typing import Callable, Optional
 
 import torch
 
@@ -13,110 +13,109 @@ from vllm.model_executor.layers.fused_moe.utils import _fp8_perm, _resize_cache
 from vllm.scalar_type import scalar_types
 
 
-class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
-
-    def __init__(
-        self,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
-        out_dtype: torch.dtype,
-    ):
-        super().__init__()
-        self.ab_strides1 = ab_strides1
-        self.c_strides1 = c_strides1
-        self.ab_strides2 = ab_strides2
-        self.c_strides2 = c_strides2
-        self.out_dtype = out_dtype
-
-    def workspace_shapes(
-        self,
-        a: torch.Tensor,
-        M: int,
-        N: int,
-        K: int,
-        topk: int,
-        num_experts: int,
-    ) -> tuple[int, int, torch.dtype]:
-        # Note that K, N are transposed
-        N, K = K, N
-        workspace1 = M * topk * max(2 * N, K)
-        workspace2 = M * topk * N
-        return (workspace1, workspace2, self.out_dtype)
-
-    def apply(
-        self,
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: str,
-        global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        w1_scale: Optional[torch.Tensor],
-        w2_scale: Optional[torch.Tensor],
-        w1_zp: Optional[torch.Tensor],
-        w2_zp: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
-        workspace13: torch.Tensor,
-        workspace2: torch.Tensor,
-        expert_num_tokens: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        a1q = hidden_states
-
-        assert w1_scale is not None
-        assert w2_scale is not None
-        assert w1.dtype == torch.float8_e4m3fn
-        assert w2.dtype == torch.float8_e4m3fn
-        assert a1q.shape[1] == w1.shape[1], "Hidden size mismatch w1"
-        assert w1.shape[2] == w2.shape[1] * 2, "Hidden size mismatch w2"
-        assert w1.shape[0] == w2.shape[0], "Expert number mismatch"
-        assert a1q_scale is None or a1q_scale.dim(
-        ) == 0 or a1q_scale.shape[0] == 1 or a1q_scale.shape[0] == a1q.shape[
-            0], "Input scale shape mismatch"
-        assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[
-            1] == w1.shape[2], "W1 scale shape mismatch"
-        assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[
-            1] == w2.shape[2], "W2 scale shape mismatch"
-        assert w1.shape[0] == w2.shape[0], "Weights expert number mismatch"
-        assert w1.shape[0] == w1_scale.shape[
-            0], "w1 scales expert number mismatch"
-        assert w1.shape[0] == w2_scale.shape[
-            0], "w2 scales expert number mismatch"
-        assert a2_scale is None or a1q_scale is None or a2_scale.shape == a1q_scale.shape, "Intermediate scale shape mismatch"  # noqa: E501
-        assert self.ab_strides1.shape[0] == w1.shape[
-            0], "AB Strides 1 expert number mismatch"
-        assert self.c_strides1.shape[0] == w1.shape[
-            0], "C Strides 1 expert number mismatch"
-        assert self.ab_strides2.shape[0] == w2.shape[
-            0], "AB Strides 2 expert number  mismatch"
-        assert self.c_strides2.shape[0] == w2.shape[
-            0], "C Strides 2 expert number mismatch"
-        assert self.out_dtype in [torch.half,
-                                  torch.bfloat16], "Invalid output dtype"
-
-        M = a1q.shape[0]
-        _, N, K = w2.shape  # because w1 + w2 are transposed
-        device = a1q.device
-
-        assert w1.shape[1] == K
-        assert global_num_experts != -1
-        assert a1q_scale is not None
-
-        if expert_map is not None:
-            "Translate info from expert_map to topk_ids"
-            local_topk_ids = torch.where(expert_map[topk_ids] != -1,
-                                         expert_map[topk_ids], -1)
-        else:
-            local_topk_ids = topk_ids
+def run_cutlass_moe_fp8(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation_callable: Callable,
+    global_num_experts: int,
+    expert_map: Optional[torch.Tensor],
+    w1_scale: Optional[torch.Tensor],
+    w2_scale: Optional[torch.Tensor],
+    a1q_scale: Optional[torch.Tensor],
+    a2_scale: Optional[torch.Tensor],
+    workspace13: torch.Tensor,
+    workspace2: torch.Tensor,
+    expert_num_tokens: Optional[torch.Tensor],
+    out_dtype: torch.dtype,
+    per_act_token: bool,
+    per_out_ch: bool,
+) -> torch.Tensor:
+    a1q = hidden_states
+
+    assert w1_scale is not None
+    assert w2_scale is not None
+    assert w1.dtype == torch.float8_e4m3fn
+    assert w2.dtype == torch.float8_e4m3fn
+    if expert_num_tokens is None:
+        assert a1q.shape[1] == w1.shape[2], "Hidden size mismatch w1"
+    else:
+        assert a1q.shape[2] == w1.shape[2], "Hidden size mismatch w1"
+    assert w1.shape[1] == w2.shape[2] * 2, "Hidden size mismatch w2"
+    assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[
+        1] == w1.shape[1], "W1 scale shape mismatch"
+    assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[
+        1] == w2.shape[1], "W2 scale shape mismatch"
+    assert w1.shape[0] == w2.shape[0], "Expert number mismatch"
+    assert a1q_scale is None or a1q_scale.dim(
+    ) == 0 or a1q_scale.shape[0] == 1 or a1q_scale.shape[0] == a1q.shape[
+        0], "Input scale shape mismatch"
+    assert w1.shape[0] == w2.shape[0], "Weights expert number mismatch"
+    assert w1.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+    assert a2_scale is None or a2_scale.dim(
+    ) == 0 or a2_scale.shape[0] == 1 or a2_scale.shape[0] == a1q.shape[
+        0], "Intermediate scale shape mismatch"
+    assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
+    if expert_map is not None:
+        assert expert_num_tokens is None
+
+    # We have two modes: PPLX and non-PPLX. We differentiate them by checking
+    # if expert_num_tokens is None (expert_num_tokens is a tensor which PPLX
+    # uses to track the number of tokens per expert).
+    # In the non-PPLX mode, the input tokens are not padded: thus, the shape
+    # of the input is [total_num_tokens, hidden_size]. The input and output
+    # require shuffling by a_map and c_map such that the tokens assigned to
+    # each expert are contiguous.
+    # In the PPLX mode, the input tokens are padded per expert to ensure that
+    # the PPLX dispatch and combine functions work correctly: thus, the shape
+    # of the input is [num_experts, max_num_tokens_per_expert, hidden_size].
+    # The PPLX input and output require no shuffling by a_map and c_map since
+    # their tokens are already contiguous for each expert as a result of
+    # the dispatch function.
+    is_pplx = expert_num_tokens is not None
+
+    M = a1q.shape[0]  # no pplx
+    padded_M = a1q.shape[1]  # pplx
+    _, K, N = w2.shape
+    device = a1q.device
+
+    assert w1.shape[2] == K
+    assert global_num_experts != -1
+    assert a1q_scale is not None
+
+    if expert_map is not None:
+        "Translate info from expert_map to topk_ids"
+        local_topk_ids = torch.where(expert_map[topk_ids] != -1,
+                                     expert_map[topk_ids], -1)
+    else:
+        local_topk_ids = topk_ids
+
+    topk = local_topk_ids.shape[1]
+    local_E = w1.shape[0]
+
+    if is_pplx:
+        expert_offsets = torch.empty((local_E),
+                                     dtype=torch.int32,
+                                     device=device)
+        problem_sizes1 = torch.empty((local_E, 3),
+                                     dtype=torch.int32,
+                                     device=device)
+        problem_sizes2 = torch.empty((local_E, 3),
+                                     dtype=torch.int32,
+                                     device=device)
 
-        topk = local_topk_ids.shape[1]
+        ops.get_cutlass_pplx_moe_mm_data(expert_offsets, problem_sizes1,
+                                         problem_sizes2, expert_num_tokens,
+                                         local_E, padded_M, N, K)
 
-        per_act_token = a1q_scale.numel() != 1 if a1q_scale is not None else (
-            a2_scale.numel() != 1 if a2_scale is not None else False)
+        w1_scale = w1_scale.reshape(w1_scale.shape[0], -1)
+        w2_scale = w2_scale.reshape(w2_scale.shape[0], -1)
+        a1q = a1q.reshape(-1, a1q.shape[2])
+        a1q_scale = a1q_scale.reshape(-1, a1q_scale.shape[2]).contiguous()
 
+    else:
         expert_offsets = torch.empty((global_num_experts + 1),
                                      dtype=torch.int32,
                                      device=device)
@@ -149,50 +148,130 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
 
         a1q = _fp8_perm(a1q, a_map)
         a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale
-
+        expert_offsets = expert_offsets[:-1]
+
+    ab_strides1 = torch.full((w1.shape[0], ),
+                             K,
+                             device=device,
+                             dtype=torch.int64)
+    c_strides1 = torch.full((w1.shape[0], ),
+                            2 * N,
+                            device=device,
+                            dtype=torch.int64)
+    ab_strides2 = torch.full((w1.shape[0], ),
+                             N,
+                             device=device,
+                             dtype=torch.int64)
+    c_strides2 = torch.full((w1.shape[0], ),
+                            K,
+                            device=device,
+                            dtype=torch.int64)
+
+    if is_pplx:
+        c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2))
+        c2 = _resize_cache(workspace2, (local_E * padded_M, N))
+        c3 = _resize_cache(workspace13, (local_E * padded_M, K))
+    else:
         c1 = _resize_cache(workspace13, (M * topk, N * 2))
         c2 = _resize_cache(workspace2, (M * topk, N))
         c3 = _resize_cache(workspace13, (M * topk, K))
 
-        ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale,
-                           expert_offsets[:-1], problem_sizes1,
-                           self.ab_strides1, self.ab_strides1, self.c_strides1)
+    ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets,
+                       problem_sizes1, ab_strides1, ab_strides1, c_strides1,
+                       per_act_token, per_out_ch)
 
-        self.activation(activation, c2, c1)
+    activation_callable(c2, c1)
 
-        a2q, a2q_scale = ops.scaled_fp8_quant(
-            c2, a2_scale, use_per_token_if_dynamic=per_act_token)
+    a2q, a2q_scale = ops.scaled_fp8_quant(
+        c2, a2_scale, use_per_token_if_dynamic=per_act_token)
 
-        if expert_map is not None:
-            c3.fill_(0)
+    if expert_map is not None:
+        c3.fill_(0)
+
+    ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, expert_offsets,
+                       problem_sizes2, ab_strides2, ab_strides2, c_strides2,
+                       per_act_token, per_out_ch)
 
-        ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale,
-                           expert_offsets[:-1], problem_sizes2,
-                           self.ab_strides2, self.ab_strides2, self.c_strides2)
+    if is_pplx:
+        return c3.reshape(local_E, padded_M, K)
+    else:
+        return c3[c_map].view(M, topk, K)
 
-        c3 = c3[c_map]
 
-        return c3
+class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
+
+    def __init__(
+        self,
+        max_experts_per_worker: int,
+        out_dtype: torch.dtype,
+        per_act_token: bool,
+        per_out_ch: bool,
+    ):
+        super().__init__()
+        self.max_experts_per_worker = max_experts_per_worker
+        self.out_dtype = out_dtype
+        self.per_act_token = per_act_token
+        self.per_out_ch = per_out_ch
+
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        aq: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+    ) -> tuple[int, int, torch.dtype]:
+        padded_M = aq.shape[1]
+        workspace1 = self.max_experts_per_worker * padded_M * max(N, K)
+        workspace2 = self.max_experts_per_worker * padded_M * (N // 2)
+        return (workspace1, workspace2, self.out_dtype)
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_num_tokens: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE"
+        assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE"
+        activation_callable = lambda i, o: self.activation(activation, i, o)
+        return run_cutlass_moe_fp8(hidden_states, w1, w2, topk_ids,
+                                   activation_callable, global_num_experts,
+                                   expert_map, w1_scale, w2_scale, a1q_scale,
+                                   a2_scale, workspace13, workspace2,
+                                   expert_num_tokens, self.out_dtype,
+                                   self.per_act_token, self.per_out_ch)
 
 
-#TODO make the grouped gemm kernel consistent with scaled gemm kernel
 def cutlass_moe_fp8(
     a: torch.Tensor,
     w1_q: torch.Tensor,
     w2_q: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
-    ab_strides1: torch.Tensor,
-    c_strides1: torch.Tensor,
-    ab_strides2: torch.Tensor,
-    c_strides2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    activation: str = "silu",
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    out_dtype: torch.dtype = torch.half,
     expert_map: Optional[torch.Tensor] = None,
     apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
 ) -> torch.Tensor:
     """
     This function computes a a8w8-quantized Mixture of Experts (MoE) layer
@@ -207,25 +286,17 @@ def cutlass_moe_fp8(
         Shape: [num_experts, K, 2N] (the weights are passed transposed)
     - w2_q (torch.Tensor): The second set of fp8-quantized expert weights.
         Shape: [num_experts, N, K] (the weights are passed transposed)
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - topk_ids (torch.Tensor): The token->expert mappings.
     - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
         Shape: [num_experts] or [num_experts, 2N]
     - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
         Shape: [num_experts] or [num_experts, K]
-    - gating_output (torch.Tensor): The output of the gating operation
-        (before softmax).
-    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
-    - ab_strides1 (torch.Tensor): The input and weights strides of the first
-        grouped gemm.
-    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
-    - ab_strides2 (torch.Tensor): The input and weights strides of the second
-        grouped gemm.
-    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
     - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
         Shape: scalar or [M]
     - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
         quantize the intermediate result between the gemms.
         Shape: scalar or [M]
-    - out_dtype (torch.dtype): The output tensor type.
     - expert_map (Optional[torch.Tensor]): In the case of Expert parallel,
         every Rank is responsible for a subset of experts. expert_map is a
         mapping from global expert-id to local expert-id. When expert_map[i]
@@ -233,24 +304,27 @@ def cutlass_moe_fp8(
         expert-id i.
     - apply_router_weight_on_input (bool): When true, the topk weights are
         applied directly on the inputs. This is only applicable when topk is 1.
+    - global_num_experts (int): The total number of experts.
 
     Returns:
     - torch.Tensor: The fp16 output tensor after applying the MoE layer.
     """
     per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
         a2_scale.numel() != 1 if a2_scale is not None else False)
+    per_out_ch = w1_scale.numel() != w1_q.shape[0]
+
+    out_dtype = a.dtype
 
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(
-            per_channel_quant=per_act_token,
             quant_dtype=torch.float8_e4m3fn,
+            per_channel_quant=per_act_token,
         ),
         CutlassExpertsFp8(
-            ab_strides1,
-            c_strides1,
-            ab_strides2,
-            c_strides2,
-            out_dtype,
+            max_experts_per_worker=global_num_experts,
+            out_dtype=out_dtype,
+            per_act_token=per_act_token,
+            per_out_ch=per_out_ch,
         ),
     )
 
@@ -260,9 +334,12 @@ def cutlass_moe_fp8(
         w2_q,
         topk_weights,
         topk_ids,
-        expert_map=expert_map,
-        w1_scale=w1_scale,
-        w2_scale=w2_scale,
+        False,
+        activation,
+        global_num_experts if global_num_experts != -1 else w1_q.size(0),
+        expert_map,
+        w1_scale,
+        w2_scale,
         a1_scale=a1_scale,
         a2_scale=a2_scale,
         apply_router_weight_on_input=apply_router_weight_on_input,
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 97b4a49c0..c00e849b4 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -73,6 +73,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 7490a192d..68a3485ff 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -521,6 +521,7 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -632,6 +633,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index de7a9a8d0..ba1498e65 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1545,6 +1545,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 1812f3b67..cf8e4ee65 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -9,6 +9,9 @@ from typing import Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
 from torch.nn.parameter import UninitializedParameter
 
 import vllm.envs as envs
@@ -210,6 +213,7 @@ class MoEConfig:
     moe_parallel_config: FusedMoEParallelConfig
 
     in_dtype: torch.dtype  # The activation type.
+    quant_dtype: torch.dtype = None
 
     # TODO: add more quantization params, blocked, per-token, etc.
     block_size: int = 128
@@ -264,8 +268,22 @@ class FusedMoeWeightScaleSupported(Enum):
     BLOCK = "block"
 
 
+def get_quant_config_input_activations(
+        quant_config: Optional[QuantizationConfig]
+) -> Optional[QuantizationArgs]:
+    if (quant_config is not None and hasattr(quant_config, 'target_scheme_map')
+            and "Linear" in quant_config.target_scheme_map and
+            "input_activations" in quant_config.target_scheme_map["Linear"]):
+        return quant_config.target_scheme_map["Linear"].get(
+            "input_activations")
+    else:
+        return None
+
+
 class FusedMoEMethodBase(QuantizeMethodBase):
 
+    moe: MoEConfig
+
     @abstractmethod
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
@@ -277,6 +295,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         all2all_manager = get_ep_group().device_communicator.all2all_manager
         assert all2all_manager is not None
 
+        self.moe = moe
         quant_dtype = None
         act_quant_block_size = None
         from vllm.model_executor.layers.quantization.fp8 import Fp8Config
@@ -297,13 +316,14 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                 # dp_size actually means tp_size, bug in pplx kernels
                 dp_size=all2all_manager.tp_group.world_size,
                 hidden_dim=moe.hidden_dim,
-                hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
+                hidden_dim_bytes=moe.hidden_dim * moe.quant_dtype.itemsize,
                 # For blocked per token: set to
                 #   ceil_div(hidden_dim, block_size) * sizeof(float32)
                 # For per-token: set to sizeof(float32)
-                hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
-                    (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
-                    torch.float32.itemsize)),
+                hidden_dim_scale_bytes=(
+                    0 if moe.quant_dtype.itemsize != 1 else
+                    ((moe.hidden_dim + moe.block_size - 1) // moe.block_size *
+                     torch.float32.itemsize)),
             )
 
             # Intranode pplx a2a takes a group name while internode does not.
@@ -313,6 +333,9 @@ class FusedMoEMethodBase(QuantizeMethodBase):
 
             handle = all2all_manager.get_handle(all_to_all_args)
 
+            input_activations = get_quant_config_input_activations(
+                quant_config)
+
             prepare_finalize = PplxPrepareAndFinalize(
                 handle,
                 max_num_tokens=moe.max_num_tokens,
@@ -320,7 +343,10 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                 rank=all2all_manager.rank,
                 # dp_size actually means tp_size, bug in pplx kernels
                 dp_size=all2all_manager.tp_group.world_size,
-                quant_dtype=moe.in_dtype,
+                quant_dtype=moe.quant_dtype,
+                per_act_token=(input_activations.strategy
+                               == QuantizationStrategy.TOKEN
+                               if input_activations is not None else False),
             )
         elif moe.use_deepep_ht_kernels:
             assert moe.dp_size == all2all_manager.dp_world_size
@@ -365,15 +391,15 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         self.topk_indices_dtype = None
         if prepare_finalize is not None:
             self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
-            experts = self.select_gemm_impl(prepare_finalize)
+            experts = self.select_gemm_impl(prepare_finalize, moe)
             self.fused_experts = FusedMoEModularKernel(
                 prepare_finalize,
                 experts,
             )
 
     def select_gemm_impl(
-        self, prepare_finalize: FusedMoEPrepareAndFinalize
-    ) -> FusedMoEPermuteExpertsUnpermute:
+            self, prepare_finalize: FusedMoEPrepareAndFinalize,
+            moe: Optional[MoEConfig]) -> FusedMoEPermuteExpertsUnpermute:
         # based on the all2all implementation, select the appropriate
         # gemm implementation
         raise NotImplementedError(
@@ -419,7 +445,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         else:
             self.rocm_aiter_fused_experts = None  # type: ignore
 
-    def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize):
+    def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize,
+                         moe: Optional[MoEConfig]):
 
         assert self.fused_experts == fused_experts
 
@@ -809,7 +836,6 @@ class FusedMoE(torch.nn.Module):
         activation: str = "silu",
     ):
         super().__init__()
-
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
@@ -869,14 +895,24 @@ class FusedMoE(torch.nn.Module):
             from vllm_hpu_extension.ops import DynamicFusedMOE
             self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)
 
+        # Only support float8 for now.
+        quant_dtype = params_dtype
+        if quant_config is not None:
+            input_activations = get_quant_config_input_activations(
+                quant_config)
+            if (input_activations is not None
+                    and input_activations.num_bits == 8
+                    and input_activations.type == QuantizationType.FLOAT):
+                quant_dtype = torch.float8_e4m3fn
+
         moe = MoEConfig(
             num_experts=self.global_num_experts,
             experts_per_token=top_k,
             hidden_dim=hidden_size,
             num_local_experts=self.local_num_experts,
             moe_parallel_config=self.moe_parallel_config,
-            # TODO (bnell): this needs to be fixed for quantized types.
             in_dtype=params_dtype,
+            quant_dtype=quant_dtype,
             max_num_tokens=MOE_DP_CHUNK_SIZE,
         )
         self.moe_config = moe
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 2c27d31eb..e7aaf62fb 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -175,6 +175,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -309,7 +310,7 @@ class FusedMoEModularKernel(torch.nn.Module):
 
         # Use a1 here to decipher the correct workspace datatype
         workspace13_shape, workspace2_shape, workspace_dtype = (
-            self.fused_experts.workspace_shapes(a1, M, N, K, top_k,
+            self.fused_experts.workspace_shapes(a1, a1q, M, N, K, top_k,
                                                 global_num_experts))
 
         # We can reuse the memory between cache1 and cache3 because by the time
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 1170a16f3..5bc01dbf2 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -21,7 +21,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                  rank: int,
                  dp_size: int,
                  quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[list[int]] = None):
+                 block_shape: Optional[list[int]] = None,
+                 per_act_token: bool = False):
         super().__init__()
         assert max_num_tokens > 0
         self.a2a = a2a
@@ -31,6 +32,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self.rank = rank
         self.dp_size = dp_size
         self.quant_dtype = quant_dtype
+        self.per_act_token = per_act_token
 
     def max_num_tokens_per_rank(self) -> Optional[int]:
         return self.max_num_tokens
@@ -66,13 +68,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 "apply_router_weight_on_input is only implemented for topk=1")
             a1 = a1 * rank_topk_weights.to(a1.dtype)
 
-        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
-            a2_scale.numel() != 1 if a2_scale is not None else False)
+        repeat_cols = 4
+        repeat_rows = 1 if self.per_act_token else a1.shape[0]
+        a1q, a1q_scale = moe_kernel_quantize_input(
+            a1, (None if self.per_act_token else a1_scale), self.quant_dtype,
+            self.per_act_token, self.block_shape)
 
-        a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale,
-                                                   self.quant_dtype,
-                                                   per_act_token,
-                                                   self.block_shape)
+        if a1q_scale is not None:
+            a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
 
         # rem_experts need to be 0 for pplx to work properly.
         rem_experts = num_experts % self.world_size
@@ -100,7 +103,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                           else 1) * float32_size
             expert_x_scale = torch.empty(
                 (
-                    num_experts,
+                    num_local_experts,
                     expert_x.size(1),
                     (expert_x.size(2) + block_size - 1) // block_size,
                 ),
@@ -121,6 +124,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             indices=rank_topk_ids,
             bound_m=bound_m,
         )
+        if expert_x_scale is not None:
+            expert_x_scale = expert_x_scale[:, :, 0:1]
 
         return expert_x, expert_x_scale, expert_num_tokens, None, None
 
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index 920931a93..87de29444 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -37,6 +37,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -49,9 +50,9 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K):
             assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.workspace_shapes(
-                a, M, N, K, topk, num_experts)
+                a, aq, M, N, K, topk, num_experts)
         else:
-            return self.triton_expert.workspace_shapes(a, M, N, K, topk,
+            return self.triton_expert.workspace_shapes(a, aq, M, N, K, topk,
                                                        num_experts)
 
     def apply(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index ebb029572..bc9d399cf 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
+import importlib
 from enum import Enum
 from typing import Callable, Optional
 
@@ -11,7 +12,6 @@ from compressed_tensors.quantization import (ActivationOrdering,
                                              QuantizationStrategy)
 
 import vllm.envs as envs
-import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
@@ -30,6 +30,15 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
+has_pplx = importlib.util.find_spec("pplx_kernels") is not None
+
+if current_platform.is_cuda_alike():
+    from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+        BatchedPrepareAndFinalize)
+    if has_pplx:
+        from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
+            PplxPrepareAndFinalize)
+
 logger = init_logger(__name__)
 
 
@@ -77,8 +86,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
             else:
                 logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                 return CompressedTensorsWNA16MarlinMoEMethod(quant_config)
-        elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
-              and layer.activation == "silu"):
+        elif quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant):
             return CompressedTensorsW8A8Fp8MoECutlassMethod(quant_config)
         elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
@@ -421,6 +429,11 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
                 "For FP8 Fused MoE layer, we require either per tensor or "
                 "channelwise, dynamic per token quantization.")
 
+        from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+            cutlass_moe_fp8)
+        self.fused_experts = cutlass_moe_fp8  # type: ignore
+        self.disable_expert_map = False
+
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
@@ -499,25 +512,6 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
             layer.w13_input_scale = None
             layer.w2_input_scale = None
 
-        device = w13_weight.device
-        # TODO strides can be shared across multiple layers
-        self.ab_strides1 = torch.full((num_experts, ),
-                                      hidden_size,
-                                      device=device,
-                                      dtype=torch.int64)
-        self.c_strides1 = torch.full((num_experts, ),
-                                     2 * intermediate_size_per_partition,
-                                     device=device,
-                                     dtype=torch.int64)
-        self.ab_strides2 = torch.full((num_experts, ),
-                                      intermediate_size_per_partition,
-                                      device=device,
-                                      dtype=torch.int64)
-        self.c_strides2 = torch.full((num_experts, ),
-                                     hidden_size,
-                                     device=device,
-                                     dtype=torch.int64)
-
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Fp8 moe kernels require a single activation scale.
         # We take the max of all the scales in case they differ.
@@ -558,6 +552,27 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
             layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
                                                         requires_grad=False)
 
+    def select_gemm_impl(self, prepare_finalize, moe):
+        from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+            CutlassExpertsFp8)
+
+        assert moe is not None
+
+        max_experts_per_worker = (
+            (moe.num_experts + prepare_finalize.world_size - 1) //
+            prepare_finalize.world_size)
+        experts = CutlassExpertsFp8(
+            max_experts_per_worker, moe.in_dtype,
+            self.input_quant.strategy == QuantizationStrategy.TOKEN,
+            self.weight_quant.strategy == QuantizationStrategy.CHANNEL)
+
+        if has_pplx and isinstance(
+                prepare_finalize,
+            (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
+            # no expert_map support in this case
+            self.disable_expert_map = True
+        return experts
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -577,9 +592,6 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
         activation: str = "silu",
     ) -> torch.Tensor:
 
-        assert activation == "silu", (
-            f"{activation} not supported for Cutlass MoE.")
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -590,27 +602,22 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
             num_expert_group=num_expert_group,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
-            e_score_correction_bias=e_score_correction_bias)
-
-        from vllm.model_executor.layers.fused_moe import cutlass_moe_fp8
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=torch.uint32)
 
-        return cutlass_moe_fp8(
+        return self.fused_experts(
             x,
-            layer.w13_weight.transpose(1, 2),
-            layer.w2_weight.transpose(1, 2),
-            layer.w13_weight_scale,
-            layer.w2_weight_scale,
+            layer.w13_weight,
+            layer.w2_weight,
             topk_weights,
             topk_ids,
-            self.ab_strides1,
-            self.c_strides1,
-            self.ab_strides2,
-            self.c_strides2,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            expert_map=None if self.disable_expert_map else expert_map,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
-            out_dtype=x.dtype,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input,
         )
 
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 5ac22b6a0..c785e0d16 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -769,7 +769,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             del layer.w13_input_scale
             del layer.w2_input_scale
 
-    def select_gemm_impl(self, prepare_finalize):
+    def select_gemm_impl(self, prepare_finalize, moe):
 
         from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
             BatchedTritonOrDeepGemmExperts)
-- 
GitLab


From 66c508b137acbd2d82590cbbf4d144e8c51271d1 Mon Sep 17 00:00:00 2001
From: QiliangCui <derrhein@gmail.com>
Date: Fri, 6 Jun 2025 20:10:24 -0700
Subject: [PATCH 223/274] [TPU][Test] Add script to run benchmark on TPU for
 buildkite (#19039)

Signed-off-by: Qiliang Cui <derrhein@gmail.com>
---
 .buildkite/scripts/tpu/cleanup_docker.sh |  24 ++++++
 .buildkite/scripts/tpu/config_v6e_1.env  |  14 ++++
 .buildkite/scripts/tpu/docker_run_bm.sh  | 102 +++++++++++++++++++++++
 .buildkite/scripts/tpu/run_bm.sh         |  94 +++++++++++++++++++++
 4 files changed, 234 insertions(+)
 create mode 100755 .buildkite/scripts/tpu/cleanup_docker.sh
 create mode 100644 .buildkite/scripts/tpu/config_v6e_1.env
 create mode 100755 .buildkite/scripts/tpu/docker_run_bm.sh
 create mode 100755 .buildkite/scripts/tpu/run_bm.sh

diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh
new file mode 100755
index 000000000..209d9c434
--- /dev/null
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -euo pipefail
+
+docker_root=$(docker info -f '{{.DockerRootDir}}')
+if [ -z "$docker_root" ]; then
+  echo "Failed to determine Docker root directory."
+  exit 1
+fi
+echo "Docker root directory: $docker_root"
+# Check disk usage of the filesystem where Docker's root directory is located
+disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+# Define the threshold
+threshold=70
+if [ "$disk_usage" -gt "$threshold" ]; then
+  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+  # Remove dangling images (those that are not tagged and not used by any container)
+  docker image prune -f
+  # Remove unused volumes / force the system prune for old images as well.
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
+  echo "Docker images and volumes cleanup completed."
+else
+  echo "Disk usage is below $threshold%. No cleanup needed."
+fi
diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env
new file mode 100644
index 000000000..441758647
--- /dev/null
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@@ -0,0 +1,14 @@
+# Environment config
+TEST_NAME=llama8b
+CONTAINER_NAME=vllm-tpu
+
+# vllm config
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
+TENSOR_PARALLEL_SIZE=1
+MAX_MODEL_LEN=2048
+DOWNLOAD_DIR=/mnt/disks/persist
+EXPECTED_THROUGHPUT=8.0
+INPUT_LEN=1800
+OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh
new file mode 100755
index 000000000..6705da03e
--- /dev/null
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+if [ ! -f "$1" ]; then
+  echo "Error: The env file '$1' does not exist."
+  exit 1  # Exit the script with a non-zero status to indicate an error
+fi
+
+ENV_FILE=$1
+
+# For testing on local vm, use `set -a` to export all variables
+source /etc/environment
+source $ENV_FILE
+
+remove_docker_container() { 
+    docker rm -f tpu-test || true; 
+    docker rm -f vllm-tpu || true;
+    docker rm -f $CONTAINER_NAME || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build docker image.
+# TODO: build the image outside the script and share the image with other
+# tpu test if building time is too long.
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg max_jobs=16 \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg GIT_REPO_CHECK=0 \
+  --tag vllm/vllm-tpu-bm \
+  --progress plain -f docker/Dockerfile.tpu .
+
+LOG_ROOT=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $LOG_ROOT"
+
+if [ -z "$HF_TOKEN" ]; then
+  echo "Error: HF_TOKEN is not set or is empty."  
+  exit 1
+fi
+
+# Make sure mounted disk or dir exists
+if [ ! -d "$DOWNLOAD_DIR" ]; then
+    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
+    exit 1
+fi
+
+echo "Run model $MODEL"
+echo
+
+echo "starting docker...$CONTAINER_NAME"
+echo    
+docker run \
+ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
+ --env-file $ENV_FILE \
+ -e HF_TOKEN="$HF_TOKEN" \
+ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
+ -e MODEL=$MODEL \
+ -e WORKSPACE=/workspace \
+ --name $CONTAINER_NAME \
+ -d \
+ --privileged \
+ --network host \
+ -v /dev/shm:/dev/shm \
+ vllm/vllm-tpu-bm tail -f /dev/null
+
+echo "run script..."
+echo
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
+
+echo "copy result back..."
+VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
+BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
+docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
+docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
+
+if [ "$BUILDKITE" = "true" ]; then
+  echo "Running inside Buildkite"
+  buildkite-agent artifact upload "$VLLM_LOG" 
+  buildkite-agent artifact upload "$BM_LOG"
+else
+  echo "Not running inside Buildkite"
+fi
+
+#
+# compare the throughput with EXPECTED_THROUGHPUT 
+# and assert meeting the expectation
+# 
+if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
+  echo "Failed to get the throughput"
+  exit 1
+fi
+
+if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
+  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
+  exit 1
+fi
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
new file mode 100755
index 000000000..877669cd9
--- /dev/null
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+set -euo pipefail
+
+VLLM_LOG="$WORKSPACE/vllm_log.txt"
+BM_LOG="$WORKSPACE/bm_log.txt"
+
+if [ -n "$TARGET_COMMIT" ]; then
+  head_hash=$(git rev-parse HEAD)
+  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
+    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
+    exit 1
+  fi
+fi
+
+echo "model: $MODEL"
+echo
+
+#
+# create a log folder
+#
+mkdir "$WORKSPACE/log"
+
+# TODO: Move to image building.
+pip install pandas
+pip install datasets
+
+#
+# create sonnet_4x
+#
+echo "Create sonnet_4x.txt"
+echo "" > benchmarks/sonnet_4x.txt
+for _ in {1..4}
+ do
+  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
+done
+
+#
+# start vllm service in backend
+#
+echo "lanching vllm..."
+echo "logging to $VLLM_LOG"
+echo
+
+VLLM_USE_V1=1 vllm serve $MODEL \
+ --seed 42 \
+ --disable-log-requests \
+ --max-num-seqs $MAX_NUM_SEQS \
+ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+ --no-enable-prefix-caching \
+ --download_dir $DOWNLOAD_DIR \
+ --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
+
+
+echo "wait for 20 minutes.."
+echo
+# sleep 1200
+# wait for 10 minutes...
+for i in {1..120}; do
+    # TODO: detect other type of errors.
+    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
+        echo "Detected RuntimeError, exiting."
+        exit 1
+    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
+        echo "Application started"
+        break
+    else
+        echo "wait for 10 seconds..."
+        sleep 10
+    fi
+done
+
+#
+# run test
+#
+echo "run benchmark test..."
+echo "logging to $BM_LOG"
+echo
+python benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --model $MODEL  \
+    --dataset-name sonnet \
+    --dataset-path benchmarks/sonnet_4x.txt \
+    --sonnet-input-len $INPUT_LEN \
+    --sonnet-output-len $OUTPUT_LEN \
+    --ignore-eos > "$BM_LOG"
+
+echo "completed..."
+echo
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput: $throughput"
+echo
-- 
GitLab


From c4296b1a273ff2f980a196f350d5df0035086418 Mon Sep 17 00:00:00 2001
From: Aaruni Aggarwal <47731267+AaruniAggarwal@users.noreply.github.com>
Date: Sat, 7 Jun 2025 09:22:52 +0530
Subject: [PATCH 224/274] [CI][PowerPC] Use a more appropriate way to select
 testcase in tests/models/language/pooling/test_embedding.py (#19253)

Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
---
 .buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 8f9b18437..36bcb015d 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -38,7 +38,7 @@ function cpu_tests() {
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
+    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
-- 
GitLab


From cf02f9b283a6a85f872e87b57dbeff7b767715e5 Mon Sep 17 00:00:00 2001
From: Driss Guessous <32754868+drisspg@users.noreply.github.com>
Date: Sat, 7 Jun 2025 00:58:55 -0400
Subject: [PATCH 225/274] Add FlexAttention to V1 (#16078)

Signed-off-by: drisspg <drisspguessous@gmail.com>
---
 tests/kernels/test_flex_attention.py         |  93 ++++
 vllm/engine/arg_utils.py                     |   1 +
 vllm/platforms/cuda.py                       |   3 +
 vllm/platforms/interface.py                  |   1 +
 vllm/v1/attention/backends/flex_attention.py | 477 +++++++++++++++++++
 5 files changed, 575 insertions(+)
 create mode 100644 tests/kernels/test_flex_attention.py
 create mode 100644 vllm/v1/attention/backends/flex_attention.py

diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py
new file mode 100644
index 000000000..040ddac10
--- /dev/null
+++ b/tests/kernels/test_flex_attention.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Integration tests for FlexAttention backend vs default backend"""
+
+import random
+
+import numpy as np
+import pytest
+import torch
+from packaging import version
+
+from vllm import LLM, SamplingParams
+
+TORCH_VERSION = version.parse(torch.__version__)
+MINIMUM_TORCH_VERSION = version.parse("2.7.0")
+
+
+def set_seed(seed):
+    """Set seeds for reproducibility"""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
+    reason="CUDA not available or PyTorch version < 2.7",
+)
+def test_flex_attention_vs_default_backend(monkeypatch):
+    """Test that FlexAttention produces the same outputs as the default backend.
+
+    This test compares the outputs from the FlexAttention backend with
+    the default backend, ensuring they are identical when using the same seed.
+    """
+    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+    seed = 42
+    max_tokens = 32
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+
+    sampling_params = SamplingParams(temperature=0.0,
+                                     top_p=1.0,
+                                     seed=seed,
+                                     max_tokens=max_tokens)
+
+    # Run with flex attention
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+        set_seed(seed)
+
+        llm_flex = LLM(
+            model_name,
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+        )
+        output_flex = llm_flex.generate(prompts, sampling_params)
+
+    # Run with default backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+        set_seed(seed)
+        llm_default = LLM(
+            model_name,
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+        )
+        output_default = llm_default.generate(prompts, sampling_params)
+
+    # Compare outputs from both backends
+    for i, (flex_result,
+            default_result) in enumerate(zip(output_flex, output_default)):
+        prompt = prompts[i]
+        flex_text = flex_result.outputs[0].text
+        default_text = default_result.outputs[0].text
+
+        assert flex_text == default_text, (
+            f"FlexAttention output doesn't match default for: {prompt!r}\n"
+            f"FlexAttention: {flex_text!r}\n"
+            f"Default: {default_text!r}")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 81f160968..4ce1b41e4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1409,6 +1409,7 @@ class EngineArgs:
             "FLASHINFER_VLLM_V1",
             "ROCM_AITER_MLA",
             "TORCH_SDPA_VLLM_V1",
+            "FLEX_ATTENTION",
         ]
         if (envs.is_set("VLLM_ATTENTION_BACKEND")
                 and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 8ad66776c..48d1aacba 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -226,6 +226,9 @@ class CudaPlatformBase(Platform):
             if selected_backend == _Backend.FLASHINFER:
                 logger.info_once("Using FlashInfer backend on V1 engine.")
                 return "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
+            if selected_backend == _Backend.FLEX_ATTENTION:
+                logger.info("Using FlexAttenion backend on V1 engine.")
+                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
             if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
                 logger.info_once("Using Triton backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 08d7aa175..f91f222b2 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -60,6 +60,7 @@ class _Backend(enum.Enum):
     BLOCK_SPARSE_FLASH_ATTN = enum.auto()
     DUAL_CHUNK_FLASH_ATTN = enum.auto()
     NO_ATTENTION = enum.auto()
+    FLEX_ATTENTION = enum.auto()
 
 
 class PlatformEnum(enum.Enum):
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
new file mode 100644
index 000000000..5b473b146
--- /dev/null
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -0,0 +1,477 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Attention layer with FlashAttention."""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from torch.nn.attention.flex_attention import (BlockMask, _mask_mod_signature,
+                                               _score_mod_signature,
+                                               create_block_mask,
+                                               flex_attention)
+
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionMetadata, AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.worker.block_table import BlockTable
+
+if current_platform.is_cuda():
+    pass
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.worker.gpu_input_batch import InputBatch
+    from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+
+create_block_mask_compiled = torch.compile(create_block_mask,
+                                           fullgraph=True,
+                                           mode="reduce-overhead")
+flex_attention_compiled = torch.compile(flex_attention, fullgraph=True)
+
+
+def _offsets_to_doc_ids_tensor(offsets: torch.Tensor) -> torch.Tensor:
+    device = offsets.device
+    counts = offsets[1:] - offsets[:-1]
+    return torch.repeat_interleave(
+        torch.arange(len(counts), device=device, dtype=torch.int32), counts)
+
+
+class FlexAttentionBackend(AttentionBackend):
+    accept_output_buffer: bool = True
+
+    @staticmethod
+    def get_supported_head_sizes() -> list[int]:
+        return [16, 32, 64, 96, 128, 160, 192, 224, 256]
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLEX_ATTENTION"
+
+    @staticmethod
+    def get_impl_cls() -> type["FlexAttentionImpl"]:
+        return FlexAttentionImpl
+
+    @staticmethod
+    def get_metadata_cls() -> type["AttentionMetadata"]:
+        return FlexAttentionMetadata
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> tuple[int, ...]:
+        return (2, num_blocks, block_size, num_kv_heads, head_size)
+
+    @staticmethod
+    def get_builder_cls() -> type["FlexAttentionMetadataBuilder"]:
+        return FlexAttentionMetadataBuilder
+
+    @staticmethod
+    def use_cascade_attention(*args, **kwargs) -> bool:
+        return False
+
+
+# @torch.compile(fullgraph=True, mode="reduce-overhead")
+def physical_to_logical_mapping(
+        block_table: torch.Tensor,
+        total_blocks: Optional[int] = None) -> torch.Tensor:
+    """
+    Creates an inverse mapping from physical block locations to logical indices.
+
+    The original block_table maps from logical blocks to physical locations:
+
+    Logical to Physical (Original block_table):
+    ┌───────────────────────────────────────────┐
+    │ Request 0:                                │
+    │                                           │
+    │ Logical Blocks:  0  1  2  3  4  5  6  7   │
+    │                  │  │  │  │  │  │  │  │   │
+    │                  v  v  v  v  v  v  v  v   │
+    │ Physical Blocks: 3  5  1  7  4  2  0  6   │
+    └───────────────────────────────────────────┘
+
+    This function creates the inverse mapping:
+
+    Physical to Logical (Inverse mapping):
+    ┌───────────────────────────────────────────┐
+    │ Request 0:                                │
+    │                                           │
+    │ Physical Blocks: 0  1  2  3  4  5  6  7   │
+    │                  │  │  │  │  │  │  │  │   │
+    │                  v  v  v  v  v  v  v  v   │
+    │ Logical Blocks:  6  2  5  0  4  1  7  3   │
+    └───────────────────────────────────────────┘
+
+    If multiple logical blocks map to the same physical block,
+    this function returns the first (minimum) logical block index.
+
+    If a physical block is not mapped to by any logical block,
+    its value in the result will be -1.
+
+
+    Args:
+        block_table: Tensor of shape [max_reqs, max_num_blocks]
+            mapping logical blocks to physical locations
+
+    Returns:
+        A tensor of shape [max_reqs, max_physical_block]
+    """
+    max_reqs, max_num_blocks = block_table.shape
+    device = block_table.device
+
+    physical_to_logical = torch.full((max_reqs, total_blocks),
+                                     -1,
+                                     dtype=torch.long,
+                                     device=device)
+
+    logical_indices = (torch.arange(max_num_blocks,
+                                    device=device).unsqueeze(0).expand(
+                                        max_reqs, -1))
+
+    physical_to_logical.scatter_(-1, block_table.to(torch.int64),
+                                 logical_indices)
+    # TODO Confirm - Seems like block 0 is always empty so we reset it manually
+    physical_to_logical[:, 0] = -1
+    return physical_to_logical
+
+
+def causal_mask_mod(b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor,
+                    kv_idx: torch.Tensor):
+    return q_idx >= kv_idx
+
+
+@dataclass
+class FlexAttentionMetadata:
+    num_actual_tokens: int  # Number of tokens excluding padding.
+    max_query_len: int
+    query_start_loc: torch.Tensor
+    max_seq_len: int
+    seq_lens: torch.Tensor
+    block_table: torch.Tensor
+    slot_mapping: torch.Tensor
+
+    use_cascade: bool
+    common_prefix_len: int
+    cu_prefix_query_lens: Optional[torch.Tensor]
+    prefix_kv_lens: Optional[torch.Tensor]
+    suffix_kv_lens: Optional[torch.Tensor]
+
+    # Block info
+    total_cache_tokens: int
+    block_size: int
+    max_possible_sequence_length: int
+    num_reqs: int
+    physical_to_logical: torch.Tensor
+    decode_offset: torch.Tensor
+
+    # For logging.
+    num_input_tokens: int = 0  # Number of tokens including padding.
+
+    # Flex Metadata
+    num_blocks = 0
+    block_mask: Optional[BlockMask] = None
+    score_mod: Optional[_score_mod_signature] = None
+    mask_mod: Optional[_mask_mod_signature] = None
+    logical_mask_mod: _mask_mod_signature = causal_mask_mod
+
+    def get_mask_mod(self) -> _mask_mod_signature:
+        """Creates the mask_mod function for FlexAttention.
+
+        This function creates the combined mask mod function that handles:
+            1. The paged attention block mapping
+            2. The mapping from packed query sequences to logical query entries
+
+        It also by defaults adds the decoding offset to the query indices.
+        With this info we create the "logical" indices that are passed to
+        mask_mod functions. This allows mask mod functions to be agnostic to
+        layout of the query and key/value tensors.
+
+        TODO is_within_lower_bound: do sequences start on block_boundaries?
+        """
+        # Create a lookup mapping from query indices -> request number
+        request_lookup = _offsets_to_doc_ids_tensor(self.query_start_loc)
+
+        def final_mask_mod(
+            b: torch.Tensor,
+            h: torch.Tensor,
+            q_idx: torch.Tensor,
+            physical_kv_idx: torch.Tensor,
+        ) -> torch.Tensor:
+            # Map query indices to corresponding request indices
+            q_req = request_lookup[q_idx]
+
+            # Convert physical KV indices to logical indices
+            physical_kv_block = physical_kv_idx // self.block_size
+            physical_kv_offset = physical_kv_idx % self.block_size
+            logical_block_idx = self.physical_to_logical[q_req,
+                                                         physical_kv_block]
+            logical_kv_idx = logical_block_idx * self.block_size + physical_kv_offset  # noqa: E501
+
+            # Determine valid kv indices
+            live_block = logical_block_idx >= 0
+            within_upper_bound = logical_kv_idx < self.seq_lens[q_req]
+            within_lower_bound = logical_kv_idx >= 0
+
+            is_valid = live_block & within_upper_bound & within_lower_bound
+
+            # Convert physical query indices to logical indices
+            local_q_idx = q_idx - self.query_start_loc[q_req]
+            logical_q_idx = local_q_idx + self.decode_offset[q_req]
+
+            # Apply mask modification only for valid indices
+            return torch.where(
+                is_valid,
+                self.logical_mask_mod(b, h, logical_q_idx, logical_kv_idx),
+                False,
+            )
+
+        return final_mask_mod
+
+    def build_block_mask(self) -> BlockMask:
+        assert self.mask_mod is not None
+        return create_block_mask_compiled(
+            self.mask_mod,
+            None,
+            None,
+            self.num_actual_tokens,
+            self.total_cache_tokens,
+        )
+
+    def __post_init__(self):
+        assert self.use_cascade is False, "Not implemented yet."
+        assert self.common_prefix_len == 0, "Not implemented yet."
+        assert self.cu_prefix_query_lens is None, "Not implemented yet."
+        assert self.prefix_kv_lens is None, "Not implemented yet."
+        assert self.suffix_kv_lens is None, "Not implemented yet."
+        self.num_blocks = self.total_cache_tokens // self.block_size
+        self.mask_mod = self.get_mask_mod()
+        self.block_mask = self.build_block_mask()
+
+
+class FlexAttentionMetadataBuilder:
+
+    def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
+                 block_table: BlockTable):
+        model_config = runner.model_config
+
+        self.runner = runner
+        self.num_heads_q = model_config.get_num_attention_heads(
+            runner.parallel_config)
+        self.num_heads_kv = model_config.get_num_kv_heads(
+            runner.parallel_config)
+        self.headdim = model_config.get_head_size()
+        self.block_size = kv_cache_spec.block_size
+        self.kv_cache_spec = kv_cache_spec
+        self.block_table = block_table
+
+    def reorder_batch(self, input_batch: "InputBatch",
+                      scheduler_output: "SchedulerOutput") -> bool:
+        return False
+
+    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
+              common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata):
+        max_seq_len = self.runner.seq_lens_np[:num_reqs].max()
+        query_start_loc = common_attn_metadata.query_start_loc
+        seq_lens = common_attn_metadata.seq_lens
+
+        block_table = self.block_table
+        block_table_tensor = block_table.get_device_tensor()[:num_reqs]
+        block_table.slot_mapping[:num_actual_tokens].copy_(
+            block_table.slot_mapping_cpu[:num_actual_tokens],
+            non_blocking=True)
+        slot_mapping = block_table.slot_mapping[:num_actual_tokens]
+
+        use_cascade = common_prefix_len > 0
+        cu_prefix_query_lens = None
+        prefix_kv_lens = None
+        suffix_kv_lens = None
+        if use_cascade:
+            raise NotImplementedError("Not yet my friend")
+
+        block_size = self.kv_cache_spec.block_size
+        max_possible_seq_len = self.runner.model_config.max_model_len
+        total_cache_tokens = (self.runner.cache_config.num_gpu_blocks *
+                              block_size)
+
+        inverse_block_table = physical_to_logical_mapping(
+            block_table_tensor, self.runner.cache_config.num_gpu_blocks)
+
+        # Get the original offset tensor
+        offset_tensor = torch.tensor(
+            self.runner.input_batch.num_computed_tokens_cpu[:num_reqs]).to(
+                self.runner.device, non_blocking=True)
+
+        out = FlexAttentionMetadata(
+            num_actual_tokens=num_actual_tokens,
+            max_query_len=max_query_len,
+            query_start_loc=query_start_loc,
+            max_seq_len=max_seq_len,
+            seq_lens=seq_lens,
+            block_table=block_table_tensor,
+            slot_mapping=slot_mapping,
+            use_cascade=use_cascade,
+            common_prefix_len=common_prefix_len,
+            cu_prefix_query_lens=cu_prefix_query_lens,
+            prefix_kv_lens=prefix_kv_lens,
+            suffix_kv_lens=suffix_kv_lens,
+            block_size=block_size,
+            max_possible_sequence_length=max_possible_seq_len,
+            num_reqs=num_reqs,
+            physical_to_logical=inverse_block_table,
+            total_cache_tokens=total_cache_tokens,
+            decode_offset=offset_tensor,
+        )
+        return out
+
+    def use_cascade_attention(self, *args, **kwargs) -> bool:
+        return False
+
+
+class FlexAttentionImpl(AttentionImpl):
+    sliding_window: Optional[tuple[int, int]]
+    alibi_slopes: Optional[torch.Tensor]
+    logits_soft_cap: Optional[float]
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[list[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+    ) -> None:
+        if blocksparse_params is not None:
+            # TODO we should support this :think
+            raise ValueError(
+                "FlashAttention does not support block-sparse attention.")
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_kv_heads
+
+        if alibi_slopes is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support alibi slopes yet.")
+        else:
+            self.alibi_slopes = None
+        if sliding_window is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support sliding window yet.")
+        else:
+            self.sliding_window = (-1, -1)
+        self.kv_cache_dtype = kv_cache_dtype
+        self.logits_soft_cap = logits_soft_cap
+        if self.logits_soft_cap is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support logits soft cap yet.")
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support kv sharing yet.")
+
+        support_head_sizes = FlexAttentionBackend.get_supported_head_sizes()
+        if head_size not in support_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by FlashAttention. "
+                f"Supported head sizes are: {support_head_sizes}. "
+                "Set VLLM_USE_V1=0 to use another attention backend.")
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "FlexAttention does not support quantized kv-cache. Yet")
+
+    @staticmethod
+    def view_as_4d(tensor: torch.Tensor) -> torch.Tensor:
+        """View a 3d tensor as 4D."""
+        if tensor.ndim == 4:
+            return tensor
+        assert tensor.ndim == 3
+        return tensor[None, :, :, :]
+
+    def forward(
+        self,
+        layer: torch.nn.Module,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: FlexAttentionMetadata,
+        output: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward pass with FLexAttention.
+
+        Args:
+            query: shape = [num_tokens, num_heads, head_size]
+            key: shape = [num_tokens, num_kv_heads, head_size]
+            value: shape = [num_tokens, num_kv_heads, head_size]
+            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """
+        assert output is not None, "Output tensor must be provided."
+        enable_gqa = self.num_kv_heads != self.num_heads
+
+        if attn_metadata is None:
+            # Profiling run.
+            return output
+            # query = self.view_as_4d(query).permute(0, 2, 1, 3)
+            # return torch.empty_like(query)
+
+        num_actual_tokens = attn_metadata.num_actual_tokens
+
+        key_cache, value_cache = kv_cache.unbind(0)
+
+        torch.ops._C_cache_ops.reshape_and_cache_flash(
+            key,
+            value,
+            key_cache,
+            value_cache,
+            attn_metadata.slot_mapping,
+            self.kv_cache_dtype,
+            layer._k_scale,
+            layer._v_scale,
+        )
+
+        # View out the block_size dim
+        key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size)
+        value_cache = value_cache.view(-1, self.num_kv_heads, self.head_size)
+        query, key_cache, value_cache = map(
+            lambda x: self.view_as_4d(x).permute(0, 2, 1, 3),
+            (query, key_cache, value_cache),
+        )
+        query = query[:, :, :num_actual_tokens, :]
+        # Doesn't work for now -> constraint violation
+        # torch._dynamo.try_mark_dynamic(query, 2)
+        out = flex_attention_compiled(
+            query,
+            key_cache,
+            value_cache,
+            attn_metadata.score_mod,
+            attn_metadata.block_mask,
+            self.scale,
+            enable_gqa=enable_gqa,
+            kernel_options={"FORCE_USE_FLEX_ATTENTION": True},
+        )
+
+        # Flex doesn't have an out variant today, rely on epilogue fusion
+        out = out.permute(0, 2, 1, 3).squeeze(0)
+        output[:num_actual_tokens, :, :].copy_(out)
+        return output
-- 
GitLab


From 122cdca5f653dcac56a63330091c837bb3e87066 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Sat, 7 Jun 2025 13:13:21 +0800
Subject: [PATCH 226/274] [Misc] refactor context extension (#19246)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 .../offline_inference/context_extension.py    | 81 ++++++++++++-------
 1 file changed, 51 insertions(+), 30 deletions(-)

diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py
index 8d7666418..df39e4c25 100644
--- a/examples/offline_inference/context_extension.py
+++ b/examples/offline_inference/context_extension.py
@@ -1,37 +1,51 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script demonstrates how to extend the context length
+of a Qwen model using the YARN method (rope_scaling)
+and run a simple chat example.
+
+Usage:
+    python examples/offline_inference/context_extension.py
+"""
 
 from vllm import LLM, SamplingParams
 
-rope_theta = 1000000
-original_max_position_embeddings = 32768
-factor = 4.0
-
-# Use yarn to extend context
-hf_overrides = {
-    "rope_theta": rope_theta,
-    "rope_scaling": {
-        "rope_type": "yarn",
-        "factor": factor,
-        "original_max_position_embeddings": original_max_position_embeddings,
-    },
-    "max_model_len": int(original_max_position_embeddings * factor),
-}
-
-llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
-
-sampling_params = SamplingParams(
-    temperature=0.8,
-    top_p=0.95,
-    max_tokens=128,
-)
-
-conversation = [
-    {"role": "system", "content": "You are a helpful assistant"},
-    {"role": "user", "content": "Hello"},
-    {"role": "assistant", "content": "Hello! How can I assist you today?"},
-]
-outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+
+def create_llm():
+    rope_theta = 1000000
+    original_max_position_embeddings = 32768
+    factor = 4.0
+
+    # Use yarn to extend context
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings": original_max_position_embeddings,
+        },
+        "max_model_len": int(original_max_position_embeddings * factor),
+    }
+
+    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
+    return llm
+
+
+def run_llm_chat(llm):
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=128,
+    )
+
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello"},
+        {"role": "assistant", "content": "Hello! How can I assist you today?"},
+    ]
+    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    return outputs
 
 
 def print_outputs(outputs):
@@ -44,4 +58,11 @@ def print_outputs(outputs):
         print("-" * 80)
 
 
-print_outputs(outputs)
+def main():
+    llm = create_llm()
+    outputs = run_llm_chat(llm)
+    print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab


From d2f0e7e6158bf0c0995ebe09c4d95ee1b24c5607 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 7 Jun 2025 17:23:28 +0800
Subject: [PATCH 227/274] [CI/Build] Improve Llama GGUF test robustness
 (#19287)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 tests/models/quantization/test_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index eafdfd1b0..32f9472c1 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -38,7 +38,7 @@ class GGUFTestConfig(NamedTuple):
 LLAMA_CONFIG = GGUFTestConfig(
     original_model="meta-llama/Llama-3.2-1B-Instruct",
     gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
-    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+    gguf_filename="Llama-3.2-1B-Instruct-Q6_K.gguf",
 )
 
 QWEN2_CONFIG = GGUFTestConfig(
-- 
GitLab


From 4e4f63ad45e403613b7841ff1a0a1dd3b57fe836 Mon Sep 17 00:00:00 2001
From: Lifans <draftbks@gmail.com>
Date: Sat, 7 Jun 2025 03:25:38 -0700
Subject: [PATCH 228/274] [Nit][Benchmark]Fix example in
 benchmark_serving_structured_output.py (#19311)

Signed-off-by: Lifan Shen <lifans@meta.com>
---
 benchmarks/benchmark_serving_structured_output.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 3848ebda9..c1501ad52 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -12,7 +12,6 @@ On the client side, run:
         --model <your_model> \
         --dataset json \
         --structured-output-ratio 1.0 \
-        --structured-output-backend auto \
         --request-rate 10 \
         --num-prompts 1000
 
-- 
GitLab


From 88be823d572a773643c8326d13b8e4ee7bf8b1dd Mon Sep 17 00:00:00 2001
From: pramenku <7664080+pramenku@users.noreply.github.com>
Date: Sat, 7 Jun 2025 18:25:09 +0530
Subject: [PATCH 229/274] [AMD] Update compatible packaging version (#19309)

Signed-off-by: pramkuma <Pramendra.Kumar@amd.com>
---
 docker/Dockerfile.rocm | 2 +-
 requirements/rocm.txt  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 34641dd0a..4f40f32a3 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -13,7 +13,7 @@ RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl
 # Remove sccache    
-RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
+RUN python3 -m pip install --upgrade pip
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
 ARG COMMON_WORKDIR
 WORKDIR ${COMMON_WORKDIR}
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index fb1febdac..d33021fc7 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -12,7 +12,8 @@ ray>=2.10.0,<2.45.0
 peft
 pytest-asyncio
 tensorizer>=2.9.0
-setuptools-scm>=8
+packaging>=24.2
 setuptools>=77.0.3,<80.0.0
+setuptools-scm>=8
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
-- 
GitLab


From 2d8476e465bb861c1e7d1e65c800b725282271c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luka=20Govedi=C4=8D?=
 <ProExpertProg@users.noreply.github.com>
Date: Sat, 7 Jun 2025 13:34:51 -0400
Subject: [PATCH 230/274] [BugFix][V1] Fix memory profiling bug (#18974)

Signed-off-by: luka <luka@neuralmagic.com>
---
 tests/models/test_initialization.py |  2 ++
 tests/v1/sample/test_logprobs.py    | 13 ++++---
 vllm/v1/worker/gpu_worker.py        | 54 +++++++++++++++++++++++------
 3 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 98a58d01e..54e8cd597 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -86,6 +86,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
             } if model_info.speculative_model else None,
             trust_remote_code=model_info.trust_remote_code,
             max_model_len=model_info.max_model_len,
+            # these tests seem to produce leftover memory
+            gpu_memory_utilization=0.80,
             load_format="dummy",
             hf_overrides=hf_overrides,
         )
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 612eca116..69180e6e5 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -42,7 +42,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
             #TODO: enable this once we support it for
             # prompt logprobs.
             enable_prefix_caching=request.param,
-            gpu_memory_utilization=0.5,
+            gpu_memory_utilization=0.4,  # up to 2 alive concurrently
     ) as vllm_model:
         yield vllm_model
 
@@ -343,10 +343,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
-        runner = VllmRunner("facebook/opt-125m",
-                            max_logprobs=1,
-                            enable_prefix_caching=False,
-                            max_model_len=256)
+        runner = VllmRunner(
+            "facebook/opt-125m",
+            max_logprobs=1,
+            enable_prefix_caching=False,
+            # 2 other llms alive during whole session
+            gpu_memory_utilization=0.15,
+            max_model_len=256)
         vllm_sampling_params = SamplingParams(logprobs=1)
         # should pass
         runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 3bf3b2221..1dfccc9b3 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -130,7 +130,20 @@ class Worker(WorkerBase):
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+            self.init_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+            requested_memory = (total_gpu_memory *
+                                self.cache_config.gpu_memory_utilization)
+            if self.init_gpu_memory < requested_memory:
+                GiB = lambda b: round(b / GiB_bytes, 2)
+                raise ValueError(
+                    f"Free memory on device ({GiB(self.init_gpu_memory)}/"
+                    f"{GiB(total_gpu_memory)} GiB) on startup is less than "
+                    f"desired GPU memory utilization "
+                    f"({self.cache_config.gpu_memory_utilization}, "
+                    f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+                    f"utilization or reduce GPU memory used by other processes."
+                )
+
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -190,28 +203,47 @@ class Worker(WorkerBase):
         # GPU did not change their memory usage during the profiling.
         assert self.init_gpu_memory > free_gpu_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory}, current free memory"
-            f" {free_gpu_memory}. This happens when the GPU memory was "
-            "not properly cleaned up before initializing the vLLM instance.")
+            f"Initial free memory {self.init_gpu_memory/GiB_bytes} GiB, "
+            f"current free memory {free_gpu_memory/GiB_bytes} GiB. "
+            f"This happens when the GPU memory was not properly cleaned up "
+            f"before initializing the vLLM instance.")
 
         # Get the peak memory allocation recorded by torch
-        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
+        peak_torch_memory = torch.cuda.memory_stats(
+        )["allocated_bytes.all.peak"]
 
         # Check for any memory left around that may have been allocated on the
         # gpu outside of `torch`. NCCL operations, for example, can use a few
-        # GB during a forward pass
+        # GB during a forward pass.
         torch.cuda.empty_cache()
         torch_allocated_bytes = torch.cuda.memory_stats(
         )["allocated_bytes.all.current"]
-        total_allocated_bytes = torch.cuda.mem_get_info(
-        )[1] - torch.cuda.mem_get_info()[0]
-        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
-        if non_torch_allocations > 0:
-            peak_memory += non_torch_allocations
+
+        # Reset after emptying torch cache
+        free_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        # Total forward allocation (current) is equal to the diff in free memory
+        fwd_alloc_bytes = self.init_gpu_memory - free_gpu_memory
+        # We assume current non-torch allocation is equal to peak
+        non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes)
+        # Total forward allocation (peak) is peak torch + non-torch
+        peak_memory = peak_torch_memory + non_torch_alloc_bytes
+
         available_kv_cache_memory = (
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
+        GiB = lambda b: b / GiB_bytes
+        logger.debug(
+            "Initial free memory: %.2f GiB, free memory: %.2f GiB, "
+            "total GPU memory: %.2f GiB", GiB(self.init_gpu_memory),
+            GiB(free_gpu_memory), GiB(total_gpu_memory))
+        logger.debug(
+            "Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
+            "%.2f GiB, available KVCache memory: %.2f GiB",
+            GiB(peak_torch_memory), GiB(non_torch_alloc_bytes),
+            GiB(available_kv_cache_memory))
+
         return int(available_kv_cache_memory)
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
-- 
GitLab


From d77f7fb871473863df48ea71b4a8706028048f19 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sun, 8 Jun 2025 08:16:31 +0800
Subject: [PATCH 231/274] [Bugfix]: Fix TypeError: 'float' object cannot be
 interpreted as an integer (#19283)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/v1/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 5b497e66c..192c90677 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -407,7 +407,7 @@ class CoreEngineActorManager:
             # For now, each DP rank can only be assigned to one node
             # TODO(rui): support allocating a single DP rank
             # to multiple nodes
-            available_engine_count = node_resources["GPU"] // world_size
+            available_engine_count = int(node_resources["GPU"]) // world_size
             if node_ip == dp_master_ip:
                 assert available_engine_count >= local_engine_count, (
                     "Not enough resources to allocate DP ranks "
-- 
GitLab


From eaa2e51088d4daf36d47e566ad90e812f80e91b8 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Sat, 7 Jun 2025 20:56:12 -0400
Subject: [PATCH 232/274] [Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)

Signed-off-by: Richard Zou <zou3519@gmail.com>
---
 tests/compile/piecewise/test_simple.py     |  2 +-
 tests/compile/piecewise/test_toy_llama.py  |  6 +--
 tests/compile/test_config.py               | 43 ++++++++++++++++++++++
 vllm/compilation/counter.py                |  2 +-
 vllm/compilation/cuda_piecewise_backend.py |  2 +-
 vllm/config.py                             |  5 ++-
 6 files changed, 52 insertions(+), 8 deletions(-)
 create mode 100644 tests/compile/test_config.py

diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 852aa44d4..a050646e5 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -95,7 +95,7 @@ def _test_simple_piecewise_compile(*, use_inductor):
             num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
             num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
             num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=
+            num_cudagraph_captured=
             6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
 
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index 2464d7889..410c0101c 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -327,7 +327,7 @@ def _test_toy_llama(*, use_inductor):
             num_piecewise_graphs_seen=0,
             num_piecewise_capturable_graphs_seen=0,
             num_backend_compilations=0,
-            num_cudagraph_caputured=0,
+            num_cudagraph_captured=0,
     ):
         outputs.append(
             run_model(llama_config, use_inductor=False, use_compile=False))
@@ -343,7 +343,7 @@ def _test_toy_llama(*, use_inductor):
             num_piecewise_graphs_seen=1,
             num_piecewise_capturable_graphs_seen=1,
             num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=
+            num_cudagraph_captured=
             2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
             **kwargs,
     ):
@@ -361,7 +361,7 @@ def _test_toy_llama(*, use_inductor):
             llama_config.num_layers,  # 1 + num_layers
             num_backend_compilations=1 +
             llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=2 *
+            num_cudagraph_captured=2 *
         (1 + llama_config.num_layers
          ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
new file mode 100644
index 000000000..4f0d6fc5a
--- /dev/null
+++ b/tests/compile/test_config.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+import vllm
+from vllm.compilation.counter import compilation_counter
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+
+from .piecewise.test_simple import SillyModel
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v1(monkeypatch):
+    """
+    TODO(rzou): The rest of tests/compile runs VLLM_USE_V1=0 right now,
+    I'll switch them over later.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '1')
+
+
+@pytest.mark.parametrize("enabled", [True, False])
+def test_use_cudagraphs(enabled):
+    assert vllm.envs.VLLM_USE_V1
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_cudagraph=enabled,
+        cudagraph_capture_sizes=[100],
+    ))
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
+
+    inputs = torch.randn(100, device="cuda")
+
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_cudagraph_captured=1 if enabled else 0,
+    ):
+        # first run is warmup
+        model(inputs)
+        # second run does CUDAGraphs recording (if enabled)
+        model(inputs)
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index c584c103f..165347cfc 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -15,7 +15,7 @@ class CompilationCounter:
     # not including the splitting ops
     num_piecewise_capturable_graphs_seen: int = 0
     num_backend_compilations: int = 0
-    num_cudagraph_caputured: int = 0
+    num_cudagraph_captured: int = 0
     # InductorAdapter.compile calls
     num_inductor_compiles: int = 0
     # EagerAdapter.compile calls
diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
index 8bf957368..993def49a 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -193,7 +193,7 @@ class CUDAPiecewiseBackend:
             entry.output = weak_ref_tensors(output)
             entry.cudagraph = cudagraph
 
-            compilation_counter.num_cudagraph_caputured += 1
+            compilation_counter.num_cudagraph_captured += 1
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm/config.py b/vllm/config.py
index 31a1d208e..d643daa5c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3918,12 +3918,14 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    use_cudagraph: bool = False
+    use_cudagraph: bool = envs.VLLM_USE_V1
     """Whether to use cudagraph inside compilation.
     - False: cudagraph inside compilation is not used.
     - True: cudagraph inside compilation is used. It requires
         that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
+    In the vLLM V1 Engine, this flag only applies for
+    CompilationLevel.PIECEWISE (aka -O3).
     Note that this is orthogonal to the cudagraph capture logic
     outside of compilation.
     TODO: move outside cudagraph logic into compilation.
@@ -4425,7 +4427,6 @@ class VllmConfig:
             # FIXME(rob): Add function to set all of these.
             if not self.compilation_config.custom_ops:
                 self.compilation_config.custom_ops = ["none"]
-            self.compilation_config.use_cudagraph = True
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
-- 
GitLab


From 3d64d366e067a8121d923d7306fbcac0867411c6 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Sun, 8 Jun 2025 04:06:48 -0400
Subject: [PATCH 233/274] [Misc] Change tests/compile to use VLLM_V1 by default
 (#19302)

Signed-off-by: rzou <zou3519@gmail.com>
---
 tests/compile/conftest.py              | 15 ---------------
 tests/compile/piecewise/test_simple.py |  2 ++
 2 files changed, 2 insertions(+), 15 deletions(-)
 delete mode 100644 tests/compile/conftest.py

diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py
deleted file mode 100644
index d86ca3710..000000000
--- a/tests/compile/conftest.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-
-
-# TEST V1: this should be removed. Right now V1 overrides
-# all the torch compile logic. We should re-enable this
-# as we add torch compile support back to V1.
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index a050646e5..9633f1398 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -13,6 +13,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                          set_current_vllm_config)
+from vllm.envs import VLLM_USE_V1
 from vllm.utils import direct_register_custom_op
 
 global_counter = 0
@@ -76,6 +77,7 @@ class SillyModel(nn.Module):
 
 
 def _test_simple_piecewise_compile(*, use_inductor):
+    assert VLLM_USE_V1
 
     vllm_config = VllmConfig(compilation_config=CompilationConfig(
         level=CompilationLevel.PIECEWISE,
-- 
GitLab


From 989dcee981171098b61f58064965729274796e97 Mon Sep 17 00:00:00 2001
From: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Date: Sun, 8 Jun 2025 16:07:02 +0800
Subject: [PATCH 234/274] Add H20-3e fused MoE kernel tuning configs for
 Qwen3-235B-A22B (#19315)

Signed-off-by: Xu Wenqing <xuwq1993@qq.com>
---
 ...E=128,N=192,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++
 ...E=128,N=384,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 000000000..b50682075
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 000000000..b0139b9f2
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
-- 
GitLab


From b9a1791e2ca5f8c78ac28514f180cd306d684c17 Mon Sep 17 00:00:00 2001
From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com>
Date: Sun, 8 Jun 2025 14:47:14 +0530
Subject: [PATCH 235/274] [Hardware][POWER] Add IBM POWER11 Support to CPU
 Extension Detection (#19082)

Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
---
 cmake/cpu_extension.cmake | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index fb763db9f..5cd2c98f2 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
 else()
     find_isa(${CPUINFO} "avx2" AVX2_FOUND)
     find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+    find_isa(${CPUINFO} "Power11" POWER11_FOUND)
     find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
     find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
     find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
     message(WARNING "vLLM CPU backend using AVX2 ISA")
     
-elseif (POWER9_FOUND OR POWER10_FOUND)
+elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
     message(STATUS "PowerPC detected")
-    # Check for PowerPC VSX support
-    list(APPEND CXX_COMPILE_FLAGS
-        "-mvsx"
-        "-mcpu=native"
-        "-mtune=native")
+    if (POWER9_FOUND)
+        list(APPEND CXX_COMPILE_FLAGS
+            "-mvsx"
+            "-mcpu=power9"
+            "-mtune=power9")
+    elseif (POWER10_FOUND OR POWER11_FOUND)
+        list(APPEND CXX_COMPILE_FLAGS
+            "-mvsx"
+            "-mcpu=power10"
+            "-mtune=power10")
+    endif()
 
 elseif (ASIMD_FOUND)
     message(STATUS "ARMv8 or later architecture detected")
-- 
GitLab


From c123bc33f9c139b293e906d9612e247c8e29dbbf Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Sun, 8 Jun 2025 06:05:55 -0700
Subject: [PATCH 236/274] [Quantization] Add compressed-tensors NVFP4 support
 (#18312)

---
 tests/quantization/test_compressed_tensors.py |  19 +-
 .../compressed_tensors/compressed_tensors.py  |  31 ++-
 .../compressed_tensors/schemes/__init__.py    |   4 +-
 .../schemes/compressed_tensors_w4a4_nvfp4.py  | 178 ++++++++++++++++++
 .../quantization/compressed_tensors/utils.py  |   1 +
 .../utils/nvfp4_emulation_utils.py            |  50 ++++-
 6 files changed, 267 insertions(+), 16 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 2c07fe29f..d68aa22be 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -14,9 +14,10 @@ from compressed_tensors.quantization import QuantizationType
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensors24, CompressedTensorsLinearMethod,
-    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+    CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported)
 from vllm.platforms import current_platform
@@ -651,9 +652,13 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output
 
 
-def test_compressed_tensors_nvfp4a16(vllm_runner):
-    # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16"
+@pytest.mark.parametrize(
+    "args",
+    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
+      CompressedTensorsW4A16Fp4),
+     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)])
+def test_compressed_tensors_nvfp4(vllm_runner, args):
+    model, scheme = args
     with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
@@ -662,7 +667,7 @@ def test_compressed_tensors_nvfp4a16(vllm_runner):
             qkv_proj = layer.self_attn.qkv_proj
             assert isinstance(qkv_proj.quant_method,
                               CompressedTensorsLinearMethod)
-            assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
+            assert isinstance(qkv_proj.scheme, scheme)
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 1ee4617e1..28c62fc5e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -24,10 +24,10 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
     CompressedTensorsMoEMethod)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensors24,
-    CompressedTensorsScheme, CompressedTensorsW4A16Fp4,
-    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
-    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
-    CompressedTensorsWNA16)
+    CompressedTensorsScheme, CompressedTensorsW4A4Fp4,
+    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
@@ -218,6 +218,26 @@ class CompressedTensorsConfig(QuantizationConfig):
         else:
             return False
 
+    def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel):
+
+        if weight_quant is None or input_quant is None:
+            return False
+
+        is_tensor_group_quant = (weight_quant.strategy
+                                 == QuantizationStrategy.TENSOR_GROUP.value
+                                 and input_quant.strategy
+                                 == QuantizationStrategy.TENSOR_GROUP.value)
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+
+        is_group_size_16 = (weight_quant.group_size == 16
+                            and input_quant.group_size == 16)
+        is_float_type = (weight_quant.type == QuantizationType.FLOAT
+                         and input_quant.type == QuantizationType.FLOAT.value)
+        is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4
+
+        return (is_tensor_group_quant and is_float_type and is_4_bits
+                and is_group_size_16 and is_symmetric)
+
     def _is_fp4a16_nvfp4(self, weight_quant: BaseModel,
                          input_quant: BaseModel):
 
@@ -353,6 +373,9 @@ class CompressedTensorsConfig(QuantizationConfig):
                     actorder=weight_quant.actorder)
 
         if is_activation_quantization_format(self.quant_format):
+            if self._is_fp4a4_nvfp4(weight_quant, input_quant):
+                return CompressedTensorsW4A4Fp4()
+
             if self._is_fp8_w8a8(weight_quant, input_quant):
                 is_fp8_w8a8_supported = self._check_scheme_supported(
                     CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 25924c733..6e4e75df7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .compressed_tensors_scheme import CompressedTensorsScheme
+from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4
 from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
                                           CompressedTensorsW4A16Sparse24)
 from .compressed_tensors_w4a16_nvfp4 import CompressedTensorsW4A16Fp4
@@ -18,5 +19,6 @@ __all__ = [
     "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24",
     "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8",
     "WNA16_SUPPORTED_BITS", "W4A16SPARSE24_SUPPORTED_BITS",
-    "CompressedTensors24", "CompressedTensorsW4A16Fp4"
+    "CompressedTensors24", "CompressedTensorsW4A16Fp4",
+    "CompressedTensorsW4A4Fp4"
 ]
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
new file mode 100644
index 000000000..9899db324
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Callable, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm._custom_ops import (cutlass_scaled_fp4_mm,
+                              cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+    dequantize_to_dtype, ref_nvfp4_quant)
+from vllm.model_executor.parameter import (GroupQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+__all__ = ["CompressedTensorsW4A4Fp4"]
+
+
+def cutlass_fp4_supported() -> bool:
+    if not current_platform.is_cuda():
+        return False
+    capability_tuple = current_platform.get_device_capability()
+    capability = -1 if capability_tuple is None else capability_tuple.to_int()
+    return cutlass_scaled_mm_supports_fp4(capability)
+
+
+class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
+
+    def __init__(self):
+        self.group_size = 16
+        self.cutlass_nvfp4_supported = cutlass_fp4_supported()
+        if not self.cutlass_nvfp4_supported:
+            logger.warning("Current platform does not support cutlass NVFP4."
+                           " Running emulations.")
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # dont restrict as emulations
+        return 80
+
+    def run_nvfp4_emulations(self, x: torch.Tensor, layer):
+        x_m, x_k = x.shape
+        output_dtype = x.dtype
+
+        # quantize input to (FP4 and interleaved block scale)
+        x_fp4, x_blockscale = ref_nvfp4_quant(x, layer.input_global_scale,
+                                              self.group_size)
+
+        # dequantize input
+        x_fp4 = x_fp4.reshape(x_m, x_k // self.group_size, self.group_size)
+        x_blockscale = x_blockscale.unsqueeze(-1) / layer.input_global_scale
+        x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype)
+        del x_fp4, x_blockscale
+
+        # dequantize weight
+        w_fp4 = layer.weight.data.view(torch.uint8)
+        w_blockscale = layer.weight_scale_swizzled.data
+        w_global_scale = layer.weight_global_scale
+        w_dq = dequantize_to_dtype(w_fp4, w_blockscale, w_global_scale,
+                                   output_dtype, x.device, self.group_size)
+
+        # matmul
+        out = torch.matmul(x_dq, w_dq.t())
+        del w_dq, x_dq
+        return out
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: list[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        # Weight
+        weight = ModelWeightParameter(data=torch.empty(
+            sum(output_partition_sizes),
+            input_size_per_partition // 2,
+            dtype=torch.uint8),
+                                      input_dim=1,
+                                      output_dim=0,
+                                      weight_loader=weight_loader)
+        layer.register_parameter("weight_packed", weight)
+
+        # Global Weight Scale
+        weight_global_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader)
+        layer.register_parameter("weight_global_scale", weight_global_scale)
+
+        # Per Group Weight Scale
+        weight_scale = GroupQuantScaleParameter(data=torch.empty(
+            sum(output_partition_sizes),
+            input_size_per_partition // self.group_size,
+            dtype=torch.float8_e4m3fn,
+        ),
+                                                input_dim=1,
+                                                output_dim=0,
+                                                weight_loader=weight_loader)
+
+        layer.register_parameter("weight_scale", weight_scale)
+
+        input_global_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader)
+        layer.register_parameter("input_global_scale", input_global_scale)
+
+    def swizzle_blockscale(self, scale: torch.tensor):
+        assert (scale.dtype == torch.float8_e4m3fn)
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
+                                            cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (swizzled_scale.reshape(M, K)
+                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
+
+    def process_weights_after_loading(self, layer) -> None:
+
+        global_input_scale = layer.input_global_scale.max().to(torch.float32)
+        layer.input_global_scale = Parameter(global_input_scale,
+                                             requires_grad=False)
+
+        layer.weight_global_scale = Parameter(
+            layer.weight_global_scale.max().to(torch.float32),
+            requires_grad=False)
+
+        swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
+        layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
+                                                requires_grad=False)
+
+        # required by cutlass kernel; need Parameter, not ModelWeightParameter
+        layer.weight = Parameter(layer.weight_packed.data, requires_grad=False)
+
+        if self.cutlass_nvfp4_supported:
+            layer.alpha = Parameter(layer.input_global_scale *
+                                    layer.weight_global_scale,
+                                    requires_grad=False)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        if self.cutlass_nvfp4_supported:
+            output_dtype = x.dtype
+            output_shape = [x.shape[0], layer.weight.shape[0]]
+
+            # quantize BF16 or FP16 to (FP4 and interleaved block scale)
+            x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
+
+            out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale,
+                                        layer.weight_scale_swizzled,
+                                        1 / layer.alpha, output_dtype)
+            if bias is not None:
+                out = out + bias
+            return out.view(*output_shape)
+        return self.run_nvfp4_emulations(x, layer)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 402646498..099d8613f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -15,6 +15,7 @@ def is_activation_quantization_format(format: str) -> bool:
         CompressionFormat.naive_quantized.value,
         CompressionFormat.int_quantized.value,
         CompressionFormat.float_quantized.value,
+        CompressionFormat.nvfp4_pack_quantized.value
     ]
     return format in _ACTIVATION_QUANTIZATION_FORMATS
 
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
index 6e8e98d54..c4ef3ce24 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@@ -2,10 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
-__all__ = [
-    "break_fp4_bytes",
-    "dequantize_to_dtype",
-]
+from vllm.scalar_type import scalar_types
+
+__all__ = ["break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant"]
+
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 
 kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
                             dtype=torch.float32)
@@ -60,3 +61,44 @@ def dequantize_to_dtype(tensor_fp4,
     # scale the tensor
     out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k)
     return out.to(dtype)
+
+
+def get_reciprocal(x):
+    if isinstance(x, torch.Tensor):
+        return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x)
+    elif isinstance(x, (float, int)):
+        return 0.0 if x == 0 else 1.0 / x
+    else:
+        raise TypeError("Input must be a float, int, or a torch.Tensor.")
+
+
+def cast_to_fp4(x):
+    sign = torch.sign(x)
+    x = torch.abs(x)
+    x[(x >= 0.0) & (x <= 0.25)] = 0.0
+    x[(x > 0.25) & (x < 0.75)] = 0.5
+    x[(x >= 0.75) & (x <= 1.25)] = 1.0
+    x[(x > 1.25) & (x < 1.75)] = 1.5
+    x[(x >= 1.75) & (x <= 2.5)] = 2.0
+    x[(x > 2.5) & (x < 3.5)] = 3.0
+    x[(x >= 3.5) & (x <= 5.0)] = 4.0
+    x[x > 5.0] = 6.0
+    return x * sign
+
+
+def ref_nvfp4_quant(x, global_scale, block_size):
+    assert global_scale.dtype == torch.float32
+    assert x.ndim == 2
+    m, n = x.shape
+    x = torch.reshape(x, (m, n // block_size, block_size))
+    vec_max = torch.max(torch.abs(x), dim=-1,
+                        keepdim=True)[0].to(torch.float32)
+    scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX))
+    scale = torch.clamp(scale, max=448, min=-448)
+    scale = scale.to(torch.float8_e4m3fn).to(torch.float32)
+    output_scale = get_reciprocal(scale * get_reciprocal(global_scale))
+
+    scaled_x = x.to(torch.float32) * output_scale
+    clipped_x = torch.clamp(scaled_x, -6.0, 6.0).reshape(m, n)
+    # both outputs are float32
+    return cast_to_fp4(clipped_x), scale.squeeze(-1)
-- 
GitLab


From cda10fa3e2bb69ea276d663e5369ba16ec42cebb Mon Sep 17 00:00:00 2001
From: jennyyyyzhen <47012288+jennyyyyzhen@users.noreply.github.com>
Date: Sun, 8 Jun 2025 06:39:12 -0700
Subject: [PATCH 237/274] [Multi Modal] Add an env var for message queue max
 chunk bytes  (#19242)

Signed-off-by: yZhen <yZhen@fb.com>
Co-authored-by: yZhen <yZhen@fb.com>
---
 vllm/envs.py                           | 7 +++++++
 vllm/v1/executor/multiproc_executor.py | 6 +++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index ffb630079..9d18a1389 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -123,6 +123,7 @@ if TYPE_CHECKING:
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
+    VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
 
 
 def get_default_cache_root():
@@ -847,6 +848,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # latency penalty when a request eventually comes.
     "VLLM_SLEEP_WHEN_IDLE":
     lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
+
+    # Control the max chunk bytes (in MB) for the rpc message queue.
+    # Object larger than this threshold will be broadcast to worker
+    # processes via zmq.
+    "VLLM_MQ_MAX_CHUNK_BYTES_MB":
+    lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
 }
 
 # --8<-- [end:env-vars-definition]
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 0bd7383b5..2148680d5 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -20,6 +20,7 @@ from typing import Any, Callable, Optional, Union, cast
 
 import cloudpickle
 
+import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
@@ -72,7 +73,10 @@ class MultiprocExecutor(Executor):
 
         # Initialize worker and set up message queues for SchedulerOutputs
         # and ModelRunnerOutputs
-        self.rpc_broadcast_mq = MessageQueue(self.world_size, self.world_size)
+        max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
+        self.rpc_broadcast_mq = MessageQueue(self.world_size,
+                                             self.world_size,
+                                             max_chunk_bytes=max_chunk_bytes)
         scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
 
         # Create workers
-- 
GitLab


From 2ffb9b6e079b73eb10a15bb795fa886b3587f85a Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Sun, 8 Jun 2025 22:17:53 +0800
Subject: [PATCH 238/274] [Bugfix] model_max_length should consider
 max_model_len in tokenizer_config (#19201)

---
 vllm/config.py                    | 15 ++++++++++++++-
 vllm/transformers_utils/config.py | 16 ++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index d643daa5c..d8318ab22 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -44,7 +44,8 @@ from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    try_get_generation_config, try_get_safetensors_metadata, uses_mrope)
+    try_get_generation_config, try_get_safetensors_metadata,
+    try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
@@ -1427,6 +1428,18 @@ class ModelConfig:
             sliding_window_len=self.get_hf_config_sliding_window(),
             spec_target_max_model_len=self.spec_target_max_model_len,
             encoder_config=self.encoder_config)
+
+        tokenizer_config = try_get_tokenizer_config(
+            self.tokenizer,
+            trust_remote_code=self.trust_remote_code,
+            revision=self.tokenizer_revision)
+
+        if tokenizer_config is None:
+            return max_model_len
+
+        model_max_length = tokenizer_config.get("model_max_length",
+                                                max_model_len)
+        max_model_len = min(max_model_len, model_max_length)
         return max_model_len
 
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 9bc3b8e09..d66e296fd 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -23,6 +23,7 @@ from transformers.models.auto.image_processing_auto import (
     get_image_processor_config)
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
@@ -867,3 +868,18 @@ def try_get_safetensors_metadata(
                           "Error retrieving safetensors")
     except Exception:
         return None
+
+
+def try_get_tokenizer_config(
+    pretrained_model_name_or_path: Union[str, os.PathLike],
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+) -> Optional[dict[str, Any]]:
+    try:
+        return get_tokenizer_config(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+        )
+    except Exception:
+        return None
-- 
GitLab


From e31ae3de366dcf1898069dd57eebcb2983b5ab1d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 9 Jun 2025 10:19:56 +0800
Subject: [PATCH 239/274] [Deprecation] Remove `inputs` arg fallback in Engine
 classes (#18799)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/engine/async_llm_engine.py         | 107 ++----------------------
 vllm/engine/llm_engine.py               |  48 +----------
 vllm/engine/multiprocessing/__init__.py |  45 +---------
 vllm/engine/multiprocessing/client.py   |  89 ++------------------
 4 files changed, 21 insertions(+), 268 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 72020a8cc..3d7d28055 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -6,12 +6,10 @@ import copy
 import time
 import weakref
 from functools import partial
-from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
-                    List, Mapping, Optional, Set, Tuple, Type, Union, overload)
+from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
+                    Mapping, Optional, Set, Tuple, Type, Union)
 from weakref import ReferenceType
 
-from typing_extensions import deprecated
-
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
@@ -36,7 +34,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, deprecate_kwargs, weak_bind
+from vllm.utils import Device, weak_bind
 
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -429,24 +427,6 @@ class _AsyncLLMEngine(LLMEngine):
         return await (
             self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
 
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    async def add_request_async(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> None:
-        ...
-
-    @overload
     async def add_request_async(
         self,
         request_id: str,
@@ -459,32 +439,10 @@ class _AsyncLLMEngine(LLMEngine):
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
     ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request_async(
-            self,
-            request_id: str,
-            prompt: Optional[PromptType] = None,
-            params: Optional[Union[SamplingParams, PoolingParams]] = None,
-            arrival_time: Optional[float] = None,
-            lora_request: Optional[LoRARequest] = None,
-            trace_headers: Optional[Mapping[str, str]] = None,
-            prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-            priority: int = 0,
-            data_parallel_rank: Optional[int] = None,
-            *,
-            inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        """Async version of
-        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
+        """
+        Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
+        """
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
@@ -521,8 +479,7 @@ class _AsyncLLMEngine(LLMEngine):
             params = await build_guided_decoding_logits_processor_async(
                 sampling_params=params,
                 tokenizer=await self.get_tokenizer_async(lora_request),
-                default_guided_backend=self.decoding_config.
-                guided_decoding_backend,
+                default_guided_backend=self.decoding_config.backend,
                 reasoning_backend=self.decoding_config.reasoning_backend,
                 model_config=self.model_config)
 
@@ -894,28 +851,7 @@ class AsyncLLMEngine(EngineClient):
                 raise
             await asyncio.sleep(0)
 
-    # This method does not need to be async, but kept that way
-    # for backwards compatibility.
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @overload
-    def add_request(
+    async def add_request(
         self,
         request_id: str,
         prompt: PromptType,
@@ -926,32 +862,7 @@ class AsyncLLMEngine(EngineClient):
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dbcf78f02..8fccf9bd2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -11,10 +11,10 @@ from functools import partial
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
                     Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast, overload
+from typing import Set, Type, Union, cast
 
 import torch
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -58,8 +58,7 @@ from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import (Counter, Device, deprecate_kwargs,
-                        resolve_obj_by_qualname, weak_bind)
+from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
 from vllm.version import __version__ as VLLM_VERSION
 from vllm.worker.model_runner_base import InputProcessingError
 
@@ -629,7 +628,6 @@ class LLMEngine:
     def stop_remote_worker_execution_loop(self) -> None:
         self.model_executor.stop_remote_worker_execution_loop()
 
-    @overload
     def add_request(
         self,
         request_id: str,
@@ -641,42 +639,6 @@ class LLMEngine:
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def add_request(
-            self,
-            request_id: str,
-            prompt: Optional[PromptType] = None,
-            params: Optional[Union[SamplingParams, PoolingParams]] = None,
-            arrival_time: Optional[float] = None,
-            lora_request: Optional[LoRARequest] = None,
-            tokenization_kwargs: Optional[dict[str, Any]] = None,
-            trace_headers: Optional[Mapping[str, str]] = None,
-            prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-            priority: int = 0,
-            *,
-            inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> None:
         """Add a request to the engine's request pool.
 
@@ -725,10 +687,6 @@ class LLMEngine:
             >>> # continue the request processing
             >>> ...
         """
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index bf9f66903..db968cd6b 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -4,9 +4,7 @@
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import List, Mapping, Optional, Union, overload
-
-from typing_extensions import deprecated
+from typing import List, Mapping, Optional, Union
 
 from vllm import PoolingParams
 from vllm.inputs import PromptType
@@ -14,7 +12,7 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 VLLM_RPC_SUCCESS_STR = "SUCCESS"
 
@@ -38,7 +36,6 @@ class RPCProcessRequest:
     prompt_adapter_request: Optional[PromptAdapterRequest] = None
     priority: int = 0
 
-    @overload
     def __init__(
         self,
         prompt: PromptType,
@@ -49,44 +46,6 @@ class RPCProcessRequest:
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
     ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def __init__(
-        self,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def __init__(
-            self,
-            prompt: Optional[PromptType] = None,
-            params: Optional[Union[SamplingParams, PoolingParams]] = None,
-            request_id: Optional[str] = None,
-            lora_request: Optional[LoRARequest] = None,
-            trace_headers: Optional[Mapping[str, str]] = None,
-            prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-            priority: int = 0,
-            *,
-            inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and params is not None
-                and request_id is not None)
-
         super().__init__()
 
         self.prompt = prompt
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index f2f442485..9e018ec7f 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -6,13 +6,12 @@ import copy
 import pickle
 from contextlib import contextmanager, suppress
 from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
-                    Optional, Union, cast, overload)
+                    Optional, Union, cast)
 
 import cloudpickle
 import psutil
 import zmq
 import zmq.asyncio
-from typing_extensions import deprecated
 from zmq import Frame  # type: ignore[attr-defined]
 from zmq.asyncio import Socket
 
@@ -49,7 +48,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 logger = init_logger(__name__)
 
@@ -442,7 +441,6 @@ class MQLLMEngineClient(EngineClient):
     def dead_error(self) -> BaseException:
         return ENGINE_DEAD_ERROR(self._errored_with)
 
-    @overload
     def generate(
         self,
         prompt: PromptType,
@@ -452,39 +450,6 @@ class MQLLMEngineClient(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def generate(
-        self,
-        *,
-        inputs: PromptType,
-        sampling_params: SamplingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def generate(
-        self,
-        prompt: Optional[PromptType] = None,
-        sampling_params: Optional[SamplingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
 
@@ -506,16 +471,12 @@ class MQLLMEngineClient(EngineClient):
                 Any priority other than 0 will lead to an error if the
                 scheduling policy is not "priority".
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and sampling_params is not None
-                and request_id is not None)
-
-        return self._process_request(prompt, sampling_params, request_id,
-                                     lora_request, trace_headers,
-                                     prompt_adapter_request, priority)
+        return cast(
+            AsyncGenerator[RequestOutput, None],
+            self._process_request(prompt, sampling_params, request_id,
+                                  lora_request, trace_headers,
+                                  prompt_adapter_request, priority))
 
-    @overload
     def encode(
         self,
         prompt: PromptType,
@@ -524,37 +485,6 @@ class MQLLMEngineClient(EngineClient):
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def encode(
-        self,
-        *,
-        inputs: PromptType,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def encode(
-        self,
-        prompt: Optional[PromptType] = None,
-        pooling_params: Optional[PoolingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """Generate outputs for a request from a pooling model.
 
@@ -575,11 +505,6 @@ class MQLLMEngineClient(EngineClient):
             The output `PoolingRequestOutput` objects from the LLMEngine
             for the request.
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and pooling_params is not None
-                and request_id is not None)
-
         return cast(
             AsyncGenerator[PoolingRequestOutput, None],
             self._process_request(prompt,
-- 
GitLab


From e1c4380d4c4ffc67ba802a5b6b6949763bd8fa71 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 9 Jun 2025 10:20:53 +0800
Subject: [PATCH 240/274] [Misc] Add documentation update reminder to PR
 template (#19289)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 .github/PULL_REQUEST_TEMPLATE.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index c1d1e07bf..017ec7ca8 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,6 +2,7 @@
 - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
 - [ ] The test plan, such as providing test command.
 - [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
 
 PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
 
@@ -11,5 +12,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
 
 ## Test Result
 
+## (Optional) Documentation Update
+
 <!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
-- 
GitLab


From 8335667c22f1ae53923d85f810581c10df03a125 Mon Sep 17 00:00:00 2001
From: Kseniya Parkhamchuk <43078183+KsuParkhamchuk@users.noreply.github.com>
Date: Sun, 8 Jun 2025 21:22:10 -0500
Subject: [PATCH 241/274] [Frontend] Remove unreachable code from llm.py
 (#19288)

Signed-off-by: KsuParkhamchuk <k.parkhamchuk@gmail.com>
---
 vllm/entrypoints/llm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index fd28bf39e..7c9ca41dd 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -519,7 +519,6 @@ class LLM:
                       Sequence) and len(lora_request) != len(prompts):
             raise ValueError(
                 "Lora request list should be the same length as the prompts")
-            return lora_request
 
         if lora_request is None or isinstance(lora_request, LoRARequest):
             return [lora_request] * len(prompts)
-- 
GitLab


From 3a4d41770707f90d1d65e5c4a4c5cfb0f633dd2a Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Mon, 9 Jun 2025 03:05:44 -0400
Subject: [PATCH 242/274] [Misc] Cleanup compilation tests (#19343)

Signed-off-by: rzou <zou3519@gmail.com>
---
 tests/compile/test_config.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 4f0d6fc5a..57fd1f1c4 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -11,15 +11,6 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
 from .piecewise.test_simple import SillyModel
 
 
-@pytest.fixture(scope="function", autouse=True)
-def use_v1(monkeypatch):
-    """
-    TODO(rzou): The rest of tests/compile runs VLLM_USE_V1=0 right now,
-    I'll switch them over later.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '1')
-
-
 @pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(enabled):
     assert vllm.envs.VLLM_USE_V1
-- 
GitLab


From 12e582922145bc7441e2f20e451dd161d4ffe828 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Mon, 9 Jun 2025 15:26:12 +0800
Subject: [PATCH 243/274] [doc] improve ci doc (#19307)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 .buildkite/scripts/ci-clean-log.sh | 17 +++++++++++++++++
 .buildkite/scripts/rerun-test.sh   | 18 ++++++++++++++++++
 docs/contributing/ci-failures.md   | 16 +++++++---------
 3 files changed, 42 insertions(+), 9 deletions(-)
 create mode 100644 .buildkite/scripts/ci-clean-log.sh
 create mode 100644 .buildkite/scripts/rerun-test.sh

diff --git a/.buildkite/scripts/ci-clean-log.sh b/.buildkite/scripts/ci-clean-log.sh
new file mode 100644
index 000000000..69d8a3a28
--- /dev/null
+++ b/.buildkite/scripts/ci-clean-log.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Usage: ./ci_clean_log.sh ci.log
+# This script strips timestamps and color codes from CI log files.
+
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 ci.log"
+    exit 1
+fi
+
+INPUT_FILE="$1"
+
+# Strip timestamps
+sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
+
+# Strip colorization
+sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
diff --git a/.buildkite/scripts/rerun-test.sh b/.buildkite/scripts/rerun-test.sh
new file mode 100644
index 000000000..d79c0d5f3
--- /dev/null
+++ b/.buildkite/scripts/rerun-test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Usage: ./rerun_test.sh path/to/test.py::test_name
+
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 path/to/test.py::test_name"
+    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
+    exit 1
+fi
+
+TEST=$1
+COUNT=1
+
+while pytest -sv "$TEST"; do
+    COUNT=$((COUNT + 1))
+    echo "RUN NUMBER ${COUNT}"
+done
diff --git a/docs/contributing/ci-failures.md b/docs/contributing/ci-failures.md
index 4d8f78197..7caaf10ce 100644
--- a/docs/contributing/ci-failures.md
+++ b/docs/contributing/ci-failures.md
@@ -64,15 +64,13 @@ Download the full log file from Buildkite locally.
 
 Strip timestamps and colorization:
 
-```bash
-# Strip timestamps
-sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' ci.log
+<gh-file:.buildkite/scripts/ci-clean-log.sh>
 
-# Strip colorization
-sed -i -r 's/\x1B\[[0-9;]*[mK]//g' ci.log
+```bash
+./ci-clean-log.sh ci.log
 ```
 
-Use a tool for quick copy-pasting:
+Use a tool [wl-clipboard](https://github.com/bugaevc/wl-clipboard) for quick copy-pasting:
 
 ```bash
 tail -525 ci_build.log | wl-copy
@@ -89,10 +87,10 @@ tail -525 ci_build.log | wl-copy
 
 CI test failures may be flaky. Use a bash loop to run repeatedly:
 
+<gh-file:.buildkite/scripts/rerun-test.sh>
+
 ```bash
-COUNT=1; while pytest -sv tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]; do  
-  COUNT=$[$COUNT + 1]; echo "RUN NUMBER ${COUNT}";  
-done
+./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
 ```
 
 ## Submitting a PR
-- 
GitLab


From 0eca5eacd07061c5692250d0ef86267da831bd9f Mon Sep 17 00:00:00 2001
From: Se7en <chengzw258@163.com>
Date: Mon, 9 Jun 2025 17:30:02 +0800
Subject: [PATCH 244/274] [Doc] Fix description in the Automatic Prefix Caching
 design doc (#19333)

Signed-off-by: cr7258 <chengzw258@163.com>
---
 docs/design/v1/prefix_caching.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md
index bbdfb2552..e87e4c6a4 100644
--- a/docs/design/v1/prefix_caching.md
+++ b/docs/design/v1/prefix_caching.md
@@ -144,7 +144,7 @@ As a result, we will have the following components when the KV cache manager is
 
 **Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
 
-1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:  
+1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:  
    1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.  
    2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.  
    3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
-- 
GitLab


From 95a6568b5c366359fabf519f7863021377cc0dd6 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Mon, 9 Jun 2025 17:52:10 +0800
Subject: [PATCH 245/274] [CI/Build] Fix LoRA test (#19350)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/conftest.py            |  5 -----
 tests/lora/test_llama_tp.py       | 34 -------------------------------
 tests/lora/test_lora_functions.py |  7 -------
 tests/lora/test_phi.py            | 10 +--------
 tests/lora/test_worker.py         | 18 ++++++++--------
 5 files changed, 11 insertions(+), 63 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 0737bb886..4908f9a06 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -164,11 +164,6 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")
 
 
-@pytest.fixture(scope="session")
-def gemma_lora_files():
-    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
-
-
 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
     return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 23819f03d..3ac3b80ec 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -4,9 +4,6 @@ import subprocess
 import sys
 from typing import Union
 
-import pytest
-import ray
-
 import vllm
 from vllm import LLM
 from vllm.lora.request import LoRARequest
@@ -121,37 +118,6 @@ def test_llama_lora(sql_lora_files):
     generate_and_test(llm, sql_lora_files)
 
 
-# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
-# used by the engine yet.
-@pytest.mark.skip_v1
-@create_new_process_for_each_test()
-def test_llama_lora_warmup(sql_lora_files):
-    """Test that the LLM initialization works with a warmup LORA path and
-    is more conservative"""
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_lora():
-        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
-        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
-        return num_gpu_blocks_lora_warmup
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_no_lora():
-        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
-        num_gpu_blocks_no_lora_warmup = (
-            llm.llm_engine.cache_config.num_gpu_blocks)
-        return num_gpu_blocks_no_lora_warmup
-
-    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
-    num_gpu_blocks_no_lora_warmup = ray.get(
-        get_num_gpu_blocks_no_lora.remote())
-    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
-        "The warmup with lora should be more "
-        "conservative than without lora, therefore the number of "
-        "memory blocks for the KV cache should be "
-        "less when using lora than when not using lora")
-
-
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index e9a52e1b6..50c60341f 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -15,13 +15,6 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
 LORA_RANK = 8
 
-# @pytest.fixture(autouse=True)
-# def v1(run_with_both_engines_lora):
-#     # Simple autouse wrapper to run both engines for each test
-#     # This can be promoted up to conftest.py to run for every
-#     # test in a package
-#     pass
-
 
 def make_lora_request(lora_id: int):
     return LoRARequest(lora_name=f"{lora_id}",
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
index a21de0705..9d75512a2 100644
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -11,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
@@ -59,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip_v1
+@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
 def test_phi2_lora(phi2_lora_files):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 6f13e663a..9999c1be5 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -16,6 +16,8 @@ from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 
+NUM_LORAS = 16
+
 
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
@@ -58,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files):
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
-            gpu_memory_utilization=1.0,
             swap_space=0,
             cache_dtype="auto",
         ),
-        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
-                               max_loras=32),
+        lora_config=LoRAConfig(max_lora_rank=8,
+                               max_cpu_loras=NUM_LORAS,
+                               max_loras=NUM_LORAS),
     )
     worker = worker_cls(
         vllm_config=vllm_config,
@@ -78,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files):
     set_active_loras(worker, [])
     assert worker.list_loras() == set()
 
-    n_loras = 32
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files)
+        for i in range(NUM_LORAS)
     ]
 
     set_active_loras(worker, lora_requests)
@@ -89,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files):
         for lora_request in lora_requests
     }
 
-    for i in range(32):
+    for i in range(NUM_LORAS):
         random.seed(i)
         iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, n_loras))
+                                            k=random.randint(1, NUM_LORAS))
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
         set_active_loras(worker, lora_requests)
         assert worker.list_loras().issuperset(
             {lora_request.lora_int_id
-- 
GitLab


From 59abbd84f90e5930c37e205de8849ac4fa8a96c7 Mon Sep 17 00:00:00 2001
From: Conroy Cheers <conroy@corncheese.org>
Date: Mon, 9 Jun 2025 19:57:23 +1000
Subject: [PATCH 246/274] [Fix] Allow kernel compilation for CUDA capability
 8.7 (#19328)

Signed-off-by: Conroy Cheers <conroy@corncheese.org>
---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index afaed7cd1..bd389823f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
 
     #
@@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # kernels for the remaining archs that are not already built for 3x.
   # (Build 8.9 for FP8)
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
+    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
 
     #
-- 
GitLab


From 01810f92366ea69add8568c180701c21661ebd4f Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Mon, 9 Jun 2025 20:05:42 +0800
Subject: [PATCH 247/274] [CI] Introduce rules for llama auto-label (#19323)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 .github/mergify.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index e595060c3..5692bb5d3 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -36,6 +36,20 @@ pull_request_rules:
       add:
         - frontend
 
+- name: label-llama
+  description: Automatically apply llama label
+  conditions:
+    - or:
+      - files~=^examples/.*llama.*\.py
+      - files~=^tests/.*llama.*\.py
+      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
+      - files~=^vllm/model_executor/models/.*llama.*\.py
+      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+  actions:
+    label:
+      add:
+        - llama
+
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
-- 
GitLab


From c57c9415b17f014e5600b4b2cb6b7f0b70cfaa07 Mon Sep 17 00:00:00 2001
From: Michael Yao <haifeng.yao@daocloud.io>
Date: Mon, 9 Jun 2025 21:28:51 +0800
Subject: [PATCH 248/274] [Docs] Fix a bullet list in usage/security.md
 (#19358)

Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
---
 docs/usage/security.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/usage/security.md b/docs/usage/security.md
index 1209cc8dd..76140434d 100644
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -31,6 +31,7 @@ refer to the [PyTorch Security
 Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).
 
 Key points from the PyTorch security guide:
+
 - PyTorch Distributed features are intended for internal communication only
 - They are not built for use in untrusted environments or networks
 - No authorization protocol is included for performance reasons
-- 
GitLab


From 770e5dcdb8c2e8bc31171e614efc471d4a0c2442 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@thinkingmachines.ai>
Date: Mon, 9 Jun 2025 06:32:56 -0700
Subject: [PATCH 249/274] [full_graph] Fix query_start_loc padding (#19321)

Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
---
 vllm/v1/worker/gpu_model_runner.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e3535ef14..c39aea3d7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -655,7 +655,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         # Fill unused with -1. Needed for reshape_and_cache
         self.seq_lens[num_reqs:].fill_(0)
-        self.query_start_loc[num_reqs + 1:].fill_(-1)
+        # Note: pad query_start_loc to be non-decreasing, as kernels
+        # like FlashAttention requires that
+        self.query_start_loc[num_reqs + 1:].fill_(
+            self.query_start_loc_cpu[num_reqs].item())
 
         query_start_loc = self.query_start_loc[:num_reqs + 1]
         seq_lens = self.seq_lens[:num_reqs]
-- 
GitLab


From b8089195b450fb66b0336147854bdf2a669fc4f1 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 9 Jun 2025 22:10:44 +0800
Subject: [PATCH 250/274] [v1] Add fp32 support to v1 engine through flex attn
 (#19319)

Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../attention/test_attention_selector.py      | 28 +++++++++++++++++++
 tests/v1/worker/test_gpu_model_runner.py      |  6 ++++
 vllm/engine/arg_utils.py                      |  7 -----
 vllm/platforms/cuda.py                        |  4 +++
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index f3e641557..ea5c8013a 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -183,6 +183,34 @@ def test_env(
                     assert backend.get_name() == expected
 
 
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+@pytest.mark.parametrize("use_v1", [True, False])
+def test_fp32_fallback(
+    device: str,
+    use_v1: bool,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Test attention backend selection with fp32."""
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+
+        if device == "cpu":
+            with patch("vllm.attention.selector.current_platform",
+                       CpuPlatform()):
+                backend = get_attn_backend(16, torch.float32, torch.float32,
+                                           16, False)
+            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
+                    if use_v1 else "TORCH_SDPA")
+
+        elif device == "cuda":
+            with patch("vllm.attention.selector.current_platform",
+                       CudaPlatform()):
+                backend = get_attn_backend(16, torch.float32, torch.float32,
+                                           16, False)
+            assert (backend.get_name() == "FLEX_ATTENTION"
+                    if use_v1 else "XFORMERS")
+
+
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index caacb1652..3d51b53df 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -4,6 +4,7 @@
 import random
 
 import pytest
+import torch
 
 from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
@@ -399,6 +400,7 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
 
 
 def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
+    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
@@ -427,6 +429,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
 
 
 def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
+    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     invalid_layer = "model.layers.0.cross_attn.attn"
@@ -455,6 +458,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
 
 
 def test_init_kv_cache_with_kv_sharing_target_same_as_current():
+    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
@@ -483,6 +487,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
 
 
 def test_init_kv_cache_without_kv_sharing():
+    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
@@ -550,6 +555,7 @@ def test_init_kv_cache_without_kv_sharing():
 
 
 def test_init_kv_cache_with_kv_sharing_valid():
+    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 4ce1b41e4..837c9b1b1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1337,13 +1337,6 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False
 
-        # Only Fp16 and Bf16 dtypes since we only support FA.
-        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
-        if model_config.dtype not in V1_SUPPORTED_DTYPES:
-            _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
-                               recommend_to_remove=False)
-            return False
-
         # No Embedding Models so far.
         if model_config.task not in ["generate"]:
             _raise_or_fallback(feature_name=f"--task {model_config.task}",
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 48d1aacba..20c06c5c9 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -233,6 +233,10 @@ class CudaPlatformBase(Platform):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
                         "triton_attn.TritonAttentionBackend")
+            if dtype not in (torch.float16, torch.bfloat16):
+                logger.info_once(
+                    f"Using FlexAttenion backend for {dtype} on V1 engine.")
+                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
             if cls.is_device_capability(100):
                 # Prefer FlashInfer for V1 on Blackwell GPUs if installed
                 try:
-- 
GitLab


From 5cf2daea9ada71232f87180d38bde2c2044a1a61 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Mon, 9 Jun 2025 10:50:39 -0400
Subject: [PATCH 251/274] [Misc] Fixes and Optimizations for DeepEP + DeepGEMM
 combination. (#19298)

Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
---
 tests/kernels/moe/test_pplx_moe.py            |  2 +-
 .../device_communicators/all2all.py           | 15 +++------
 vllm/envs.py                                  |  5 +++
 .../layers/fused_moe/deep_gemm_moe.py         | 29 +++++++++-------
 .../fused_moe/deepep_ht_prepare_finalize.py   | 16 +++++----
 .../layers/fused_moe/moe_permute_unpermute.py |  2 +-
 .../layers/quantization/utils/fp8_utils.py    | 32 ++++++++++++++----
 vllm/v1/worker/gpu_model_runner.py            | 33 ++++++++++++++++++-
 8 files changed, 98 insertions(+), 36 deletions(-)

diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index bbfe31d0e..0b48bbef6 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -274,7 +274,7 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor,
     chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
     chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
 
-    b_a, b_a_scale, expert_num_tokens = prepare_finalize.prepare(
+    b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
         a_chunk,
         None,
         None,
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index cab2496bf..35f2fd0ba 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -233,16 +233,11 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
         # Defaults for internode and intranode are taken from DeepEP tests.
         num_nvl_bytes = 1024 * 1024 * 1024
         num_qps_per_rank = num_local_experts
-        num_rdma_bytes = None
-
-        if self.internode:
-            num_rdma_bytes = 1024 * 1024 * 1024
-        else:
-            num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
-                num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
-                hidden=token_hidden_size,
-                num_ranks=num_ep_ranks,
-                num_experts=num_global_experts)
+        num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+            num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
+            hidden=token_hidden_size,
+            num_ranks=num_ep_ranks,
+            num_experts=num_global_experts)
 
         assert num_rdma_bytes is not None
         return dict(group=self.cpu_group,
diff --git a/vllm/envs.py b/vllm/envs.py
index 9d18a1389..9511ed1cb 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -110,6 +110,7 @@ if TYPE_CHECKING:
     VLLM_DP_SIZE: int = 1
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
     VLLM_V0_USE_OUTLINES_CACHE: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
@@ -761,6 +762,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DP_MASTER_PORT":
     lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
 
+    # Randomize inputs during dummy runs when using Data Parallel
+    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
+    lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
+
     # Whether to use S3 path for model loading in CI via RunAI Streamer
     "VLLM_CI_USE_S3":
     lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index c00e849b4..436c632be 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -80,11 +80,13 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         num_experts: int,
     ) -> tuple[int, int, torch.dtype]:
+
         block_m = self.block_shape[0]
         M_sum = (M * topk) + num_experts * (block_m - 1)
         M_sum = round_up(M_sum, block_m)
         workspace1 = M_sum * max(N * 2, K)
-        workspace2 = M_sum * N
+        workspace2 = M_sum * max(N, K)
+
         return (workspace1, workspace2, a.dtype)
 
     def apply(
@@ -135,26 +137,31 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
         # Note: M_sum is different than the pre-permuted shape of a1q.
         M_sum = a1q.size(0)
-        workspace1 = _resize_cache(workspace13, (M_sum, N))
-        workspace2 = _resize_cache(workspace2, (M_sum, N // 2))
-        workspace3 = _resize_cache(workspace13, (M_sum, K))
+
+        mm1_out = _resize_cache(workspace13, (M_sum, N))
+        act_out = _resize_cache(workspace2, (M_sum, N // 2))
+        quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
+                                  (M_sum, N // 2))
+        mm2_out = _resize_cache(workspace2, (M_sum, K))
+        out = _resize_cache(workspace13, (inv_perm.size(0), K))
 
         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids)
+            (a1q, a1q_scale), (w1, w1_scale), mm1_out, expert_ids)
 
-        self.activation(activation, workspace2, workspace1.view(-1, N))
+        self.activation(activation, act_out, mm1_out.view(-1, N))
 
         a2q_scale: Optional[torch.Tensor] = None
-        a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
+        a2q, a2q_scale = per_token_group_quant_fp8(act_out,
                                                    self.block_shape[1],
-                                                   column_major_scales=True)
+                                                   column_major_scales=True,
+                                                   out_q=quant_out)
 
         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids)
+            (a2q, a2q_scale), (w2, w2_scale), mm2_out, expert_ids)
 
-        workspace3 = workspace3[inv_perm, ...]
+        torch.index_select(mm2_out, 0, inv_perm, out=out)
 
-        return workspace3
+        return out
 
 
 def deep_gemm_moe_fp8(
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
index 48cf01638..8c21d8aa5 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -5,6 +5,7 @@ import deep_ep
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input)
 
@@ -193,20 +194,23 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                                   apply_router_weight_on_input: bool,
                                   output_dtype: torch.dtype):
 
+        hidden_dim = fused_expert_output.size(-1)
         if fused_expert_output.ndim == 2:
-            hidden_dim = fused_expert_output.size(-1)
             fused_expert_output = fused_expert_output.view(
                 num_tokens, -1, hidden_dim)
 
         if not apply_router_weight_on_input:
             # The DeepEP combine kernels don't do the topk weight
             # multiplication. We multiply the weights locally.
-            fused_expert_output = fused_expert_output.to(torch.float32)
-            fused_expert_output = fused_expert_output * topk_weights.view(
-                fused_expert_output.size(0), -1, 1)
-            fused_expert_output = fused_expert_output.to(output_dtype)
+            m_x_topk = fused_expert_output.size(0)
+            fused_expert_output.mul_(topk_weights.view(m_x_topk, -1, 1))
 
-        return fused_expert_output.sum(dim=1).to(output_dtype)
+        out = torch.empty((num_tokens, hidden_dim),
+                          device=fused_expert_output.device,
+                          dtype=output_dtype)
+        ops.moe_sum(fused_expert_output, out)
+
+        return out
 
     def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
                  topk_weights: torch.Tensor, topk_ids: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index 89481e5bd..20ee0d9f7 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -18,7 +18,7 @@ def _moe_permute(
     expert_map: Optional[torch.Tensor],
     block_m: int,
 ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
-           Optional[torch.Tensor]]:
+           torch.Tensor]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.
     Permute the hidden states and scales according to `sorted_token_ids`.
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 270979c8e..08dc99e07 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -234,8 +234,13 @@ def _per_token_group_quant_fp8(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row
 
-    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
-    y_q_ptr += g_id * group_size
+    # Ensure offset calculations use int64 to prevent overflow
+    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
+                                                        group_size)
+    y_ptr += y_ptr_offset
+
+    y_q_ptr_offset = g_id.to(tl.int64) * group_size
+    y_q_ptr += y_q_ptr_offset
     y_s_ptr += g_id
 
     cols = tl.arange(0, BLOCK)  # N <= BLOCK
@@ -282,15 +287,23 @@ def _per_token_group_quant_fp8_colmajor(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row
 
-    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
-    y_q_ptr += g_id * group_size
+    # Ensure offset calculations use int64 to prevent overflow
+    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
+                                                        group_size)
+    y_ptr += y_ptr_offset
+
+    y_q_ptr_offset = g_id.to(tl.int64) * group_size
+    y_q_ptr += y_q_ptr_offset
 
     # Convert g_id the flattened block coordinate to 2D so we can index
     # into the output y_scales matrix
     blocks_per_row = y_num_columns // group_size
     scale_col = g_id % blocks_per_row
     scale_row = g_id // blocks_per_row
-    y_s_ptr += scale_col * y_s_col_stride + scale_row
+    # Ensure offset calculation uses int64 for y_s_ptr
+    y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to(
+        tl.int64)
+    y_s_ptr += y_s_ptr_offset
 
     cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
     mask = cols < group_size
@@ -311,6 +324,7 @@ def per_token_group_quant_fp8(
     eps: float = 1e-10,
     dtype: Optional[torch.dtype] = None,
     column_major_scales: bool = False,
+    out_q: Optional[torch.Tensor] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
@@ -321,6 +335,8 @@ def per_token_group_quant_fp8(
         eps: The minimum to avoid dividing zero.
         dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn`
         is supported for now.
+        column_major_scales: Outputs scales in column major.
+        out_q: Optional output tensor. If not provided, function will create.
     Returns:
         tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
         scaling factor for quantization.
@@ -335,7 +351,11 @@ def per_token_group_quant_fp8(
     fp8_min = finfo.min
     fp8_max = finfo.max
 
-    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    assert out_q is None or out_q.shape == x.shape
+    x_q = out_q
+    if x_q is None:
+        x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+
     M = x.numel() // group_size
     N = group_size
     if column_major_scales:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c39aea3d7..175404efe 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5,6 +5,7 @@ import copy
 import gc
 import time
 import weakref
+from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Optional, Union
 
 import numpy as np
@@ -12,6 +13,7 @@ import torch
 import torch.distributed
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadataBuilder)
@@ -1727,6 +1729,35 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         return prompt_logprobs_dict
 
+    @contextmanager
+    def maybe_randomize_inputs(self, input_ids: torch.Tensor):
+        """
+        Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
+        This is to help balance expert-selection
+         - during profile_run
+         - during DP rank dummy run 
+        """
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
+        if not randomize_inputs:
+            yield
+        else:
+            import functools
+
+            @functools.cache
+            def rand_input_ids() -> torch.Tensor:
+                return torch.randint_like(
+                    self.input_ids,
+                    low=0,
+                    high=self.model_config.get_vocab_size(),
+                    dtype=input_ids.dtype)
+
+            logger.debug("Randomizing dummy data for DP Rank")
+            input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
+                            non_blocking=True)
+            yield
+            input_ids.fill_(0)
+
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -1807,7 +1838,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                     num_tokens, None, False)
 
-            with set_forward_context(
+            with self.maybe_randomize_inputs(input_ids), set_forward_context(
                     attn_metadata,
                     self.vllm_config,
                     num_tokens=num_tokens,
-- 
GitLab


From c1c7dbbeeb6d4f0155d25b673f2063bfb14b37b9 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Mon, 9 Jun 2025 08:01:29 -0700
Subject: [PATCH 252/274] [Bugfix][Core] Prevent token lengths exceeding
 `max_model_len` in V0 (#19348)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 tests/entrypoints/llm/test_generate.py       | 22 ++++++++++++++++++++
 vllm/engine/output_processor/stop_checker.py |  2 +-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index 4676dc992..3c3281c34 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -25,6 +25,12 @@ TOKEN_IDS = [
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -104,3 +110,19 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
+
+
+def test_max_model_len():
+    max_model_len = 20
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
+    outputs = llm.generate(PROMPTS, sampling_params)
+    for output in outputs:
+        num_total_tokens = len(output.prompt_token_ids) + len(
+            output.outputs[0].token_ids)
+        assert num_total_tokens == max_model_len
diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py
index 7925d91f6..3fb2f71b5 100644
--- a/vllm/engine/output_processor/stop_checker.py
+++ b/vllm/engine/output_processor/stop_checker.py
@@ -82,7 +82,7 @@ class StopChecker:
             return
 
         # Check if the sequence has reached max_model_len.
-        if seq.get_len() > self._get_max_model_len(lora_req):
+        if seq.get_len() >= self._get_max_model_len(lora_req):
             seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
             return
 
-- 
GitLab


From ebb2f383b817bf7ade6a36f4f247944262aaf223 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 9 Jun 2025 17:33:15 -0400
Subject: [PATCH 253/274] [Quantization] Bump compressed-tensors version
 (#19295)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 requirements/common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/common.txt b/requirements/common.txt
index a6a1ffe76..871df3d21 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.10.0 # required for compressed-tensors
+compressed-tensors == 0.10.1 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
-- 
GitLab


From 31f58be96ae3e5dfc542604553ecead06cb5c487 Mon Sep 17 00:00:00 2001
From: liusiqian-tal <141730978+liusiqian-tal@users.noreply.github.com>
Date: Tue, 10 Jun 2025 05:41:21 +0800
Subject: [PATCH 254/274] [Frontend] Make TIMEOUT_KEEP_ALIVE configurable
 through env var (#18472)

Signed-off-by: liusiqian <liusiqian@tal.com>
---
 tests/async_engine/api_server_async_engine.py | 12 ++++++------
 vllm/entrypoints/api_server.py                |  4 ++--
 vllm/entrypoints/openai/api_server.py         |  4 +---
 vllm/envs.py                                  |  5 +++++
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py
index 163185b90..ec6b20f5e 100644
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@@ -8,6 +8,7 @@ import uvicorn
 from fastapi.responses import JSONResponse, Response
 
 import vllm.entrypoints.api_server
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.utils import FlexibleArgumentParser
@@ -46,9 +47,8 @@ if __name__ == "__main__":
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
-    uvicorn.run(
-        app,
-        host=args.host,
-        port=args.port,
-        log_level="debug",
-        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="debug",
+                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 56f8754c2..3d1e5dc14 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -17,6 +17,7 @@ from typing import Any, Optional
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.launcher import serve_http
@@ -29,7 +30,6 @@ from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger("vllm.entrypoints.api_server")
 
-TIMEOUT_KEEP_ALIVE = 5  # seconds.
 app = FastAPI()
 engine = None
 
@@ -134,7 +134,7 @@ async def run_server(args: Namespace,
         host=args.host,
         port=args.port,
         log_level=args.log_level,
-        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
         ssl_keyfile=args.ssl_keyfile,
         ssl_certfile=args.ssl_certfile,
         ssl_ca_certs=args.ssl_ca_certs,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2f8819bca..62f1c6a7c 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -103,8 +103,6 @@ from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
-TIMEOUT_KEEP_ALIVE = 5  # seconds
-
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
@@ -1360,7 +1358,7 @@ async def run_server_worker(listen_address,
             # NOTE: When the 'disable_uvicorn_access_log' value is True,
             # no access log will be output.
             access_log=not args.disable_uvicorn_access_log,
-            timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+            timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
             ssl_keyfile=args.ssl_keyfile,
             ssl_certfile=args.ssl_certfile,
             ssl_ca_certs=args.ssl_ca_certs,
diff --git a/vllm/envs.py b/vllm/envs.py
index 9511ed1cb..6f876d3df 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -71,6 +71,7 @@ if TYPE_CHECKING:
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
+    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: Optional[list[str]] = None
     VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
     VLLM_TORCH_PROFILER_DIR: Optional[str] = None
@@ -557,6 +558,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_RPC_TIMEOUT":
     lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
 
+    # Timeout in seconds for keeping HTTP connections alive in API server
+    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
+    lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
+
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded
-- 
GitLab


From 7d44c469fe3b59f931f9904bba46a7d0d53778fa Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Mon, 9 Jun 2025 15:38:15 -0700
Subject: [PATCH 255/274] [TPU]Fix KV cache sharing tests (#19371)

---
 tests/v1/tpu/worker/test_tpu_model_runner.py | 112 +++++++++----------
 1 file changed, 52 insertions(+), 60 deletions(-)

diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 73c0da45d..320ebef40 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import unittest.mock as mock
 
 import pytest
 
@@ -17,24 +16,8 @@ from vllm.v1.worker.tpu_model_runner import (
     TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
     _get_padded_token_len, _get_req_paddings, _get_token_paddings)
 
-# Mock torch_xla module since it may not be available in the test environments
-torch_xla_patcher = mock.patch.dict(
-    "sys.modules", {
-        "torch_xla": mock.MagicMock(),
-        "torch_xla.core.xla_model": mock.MagicMock(),
-        "torch_xla.runtime": mock.MagicMock(),
-    })
-torch_xla_patcher.start()
 
-# Mock the PallasAttentionBackend
-pallas_attention_backend_patcher = mock.patch(
-    "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", )
-pallas_attention_backend_patcher.start()
-
-
-@pytest.fixture
-def model_runner():
-    # Patchers have already been started at module level.
+def get_vllm_config():
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
@@ -60,18 +43,19 @@ def model_runner():
         cache_config=cache_config,
         scheduler_config=scheduler_config,
     )
+    return vllm_config
+
+
+def get_model_runner(vllm_config):
     device = "xla:0"  # Mocking TPU device
-    with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \
-         mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \
-         mock.patch("vllm.v1.worker.tpu_model_runner.xr"):
-        return TPUModelRunner(vllm_config, device)
+    return TPUModelRunner(vllm_config, device)
 
 
-@pytest.fixture(autouse=True, scope="session")
-def cleanup_patches():
-    yield
-    torch_xla_patcher.stop()
-    pallas_attention_backend_patcher.stop()
+@pytest.fixture
+def model_runner():
+    # Patchers have already been started at module level.
+    vllm_config = get_vllm_config()
+    return get_model_runner(vllm_config)
 
 
 def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
@@ -370,12 +354,14 @@ def test_get_req_paddings():
     assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
 
 
-@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
-def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
+        model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
-    with pytest.raises(ValueError, match=error_msg):
+    vllm_config = model_runner.vllm_config
+    with pytest.raises(ValueError, match=error_msg), \
+        set_current_vllm_config(vllm_config):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
@@ -399,13 +385,14 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
         assert fwd_context is not None
 
 
-@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
-def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     invalid_layer = "model.layers.0.cross_attn.attn"
     error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
-    with pytest.raises(ValueError, match=error_msg):
+    vllm_config = model_runner.vllm_config
+    with pytest.raises(ValueError, match=error_msg), \
+        set_current_vllm_config(vllm_config):
         fwd_context = {
             layer_0:
             Attention(
@@ -428,12 +415,13 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
         assert fwd_context is not None
 
 
-@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
-def test_init_kv_cache_with_kv_sharing_target_same_as_current():
+def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
-    with pytest.raises(ValueError, match=error_msg):
+    vllm_config = model_runner.vllm_config
+    with pytest.raises(ValueError, match=error_msg), \
+        set_current_vllm_config(vllm_config):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
@@ -457,11 +445,10 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
         assert fwd_context is not None
 
 
-@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
-def test_init_kv_cache_without_kv_sharing(model_runner):
+def test_init_kv_cache_without_kv_sharing():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
-    vllm_config = model_runner.vllm_config
+    vllm_config = get_vllm_config()
     with set_current_vllm_config(vllm_config):
         fwd_context = {
             layer_0:
@@ -482,33 +469,38 @@ def test_init_kv_cache_without_kv_sharing(model_runner):
         # suppress var not used error
         assert fwd_context is not None
     # Set high context length to test max context length estimation
-    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_config.model_config.max_model_len = 1_000_000
     vllm_ctx = vllm_config.compilation_config.static_forward_context
+    model_runner = get_model_runner(vllm_config)
     kv_cache_spec = model_runner.get_kv_cache_spec()
     assert len(kv_cache_spec) == 2
     assert len(model_runner.shared_kv_cache_layers) == 0
 
     available_memory = 20 * GiB_bytes
-    # page size for layer 0's kv_cache_spec is 32KB
-    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
+    # page size for each layer KV can be calculated as
+    # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
+    # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
+    num_expected_blocks = 20480  # 20GB / 512KB / 2 (num layers)
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.tensors) == 2
-    assert kv_cache_config.tensors[layer_0].size == available_memory // 2
-    assert kv_cache_config.tensors[layer_1].size == available_memory // 2
+    assert len(kv_cache_config.kv_cache_tensors) == 2
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
+    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
 
     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
-    assert max_context_len == 1310720
+    # max_context_len = available_memory / (page_size / block_size) / num_caches
+    # max_context_len = 5GB / (512KB / 128) / 2 = 655360
+    assert max_context_len == 655360
 
     # important: override tensor size to prevent large mem alloc during test
-    # this will only allocate 2 block worth of memory (2 * 32kb)
+    # this will only allocate 2 block worth of memory (2 * 512kb)
     kv_cache_config.num_blocks = 1
-    for layer in kv_cache_config.tensors:
-        kv_cache_config.tensors[layer].size =\
-            kv_cache_spec[layer].page_size_bytes
+    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+        kv_cache_tensor.size = (
+            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
 
     model_runner.initialize_kv_cache(kv_cache_config)
 
@@ -524,11 +516,10 @@ def test_init_kv_cache_without_kv_sharing(model_runner):
     assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
 
 
-@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
-def test_init_kv_cache_with_kv_sharing_valid(model_runner):
+def test_init_kv_cache_with_kv_sharing_valid():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
-    vllm_config = model_runner.vllm_config
+    vllm_config = get_vllm_config()
     with set_current_vllm_config(vllm_config):
         fwd_context = {
             layer_0:
@@ -552,33 +543,34 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner):
     # Set high context length to test max context length estimation
     vllm_config.model_config.max_model_len = 3_000_000
     vllm_ctx = vllm_config.compilation_config.static_forward_context
+    model_runner = get_model_runner(vllm_config)
     kv_cache_spec = model_runner.get_kv_cache_spec()
     assert len(kv_cache_spec) == 1
     assert layer_0 in kv_cache_spec
     assert model_runner.shared_kv_cache_layers[layer_1] == layer_0
 
     available_memory = 20 * GiB_bytes
-    # page size for layer 0's kv_cache_spec is 32KB
+    # page size for layer 0's kv_cache_spec is 512KB
     # with KV sharing, we can allocate (available_mem//page_size//1) blocks
     # which is twice as many as without KV sharing
-    num_expected_blocks = 655360  # 20GB / 32KB
+    num_expected_blocks = 2 * 20480  # 20GB / 512KB
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.tensors) == 1
+    assert len(kv_cache_config.kv_cache_tensors) == 1
     # Each layer now has twice the available memory for KV cache
     # compared to no KV sharing
-    assert kv_cache_config.tensors[layer_0].size == available_memory
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory
 
     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
-    assert max_context_len == 2 * 1310720
+    assert max_context_len == (2 * 655360)
 
     # important: override tensor size to prevent large mem alloc during test
-    # this will only allocate 1 block worth of memory (32kb)
+    # this will only allocate 1 block worth of memory (512kb)
     kv_cache_config.num_blocks = 1
-    kv_cache_config.tensors[layer_0].size =\
+    kv_cache_config.kv_cache_tensors[0].size =\
         kv_cache_spec[layer_0].page_size_bytes
 
     model_runner.initialize_kv_cache(kv_cache_config)
-- 
GitLab


From 8058c91108a3611c48ef0b54448ce6b48c017f5d Mon Sep 17 00:00:00 2001
From: Pavani Majety <pmajety@nvidia.com>
Date: Mon, 9 Jun 2025 16:00:07 -0700
Subject: [PATCH 256/274] [HOT-FIX] Add `kv_sharing_target_layer_name` argument
 to cutlass_mla backend (#19374)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
---
 vllm/v1/attention/backends/mla/cutlass_mla.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 70aee058e..c8ec57198 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -40,12 +40,13 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
-- 
GitLab


From 3a7cd627a86c61a6dbc8d365c2e3c921b94e9971 Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Mon, 9 Jun 2025 16:41:51 -0700
Subject: [PATCH 257/274] [Misc] Fix a config typo in
 disable_hybrid_kv_cache_manager configuration (#19383)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
---
 vllm/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index d8318ab22..3fbb6015f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4497,13 +4497,13 @@ class VllmConfig:
             # warning message here and will log it later.
             if not (current_platform.is_cuda() or current_platform.is_rocm()):
                 # Hybrid KV cache manager is not supported on non-GPU platforms.
-                self.disable_hybrid_kv_cache_manager = True
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
             if self.kv_transfer_config is not None:
                 # Hybrid KV cache manager is not compatible with KV transfer.
-                self.disable_hybrid_kv_cache_manager = True
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
             if self.kv_events_config is not None:
                 # Hybrid KV cache manager is not compatible with KV events.
-                self.disable_hybrid_kv_cache_manager = True
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
 
     def update_sizes_for_sequence_parallelism(self,
                                               possible_sizes: list) -> list:
-- 
GitLab


From cc867be19c4c1480ce399bf95db4fd5791c91cbd Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Mon, 9 Jun 2025 17:40:01 -0700
Subject: [PATCH 258/274] [V1] Reuse V0's memory_profiling util for gpu worker
 memory profiling (#19312)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
---
 vllm/utils.py                | 18 +++++++-
 vllm/v1/worker/gpu_worker.py | 86 +++++++++++++++---------------------
 2 files changed, 51 insertions(+), 53 deletions(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index c19c0221c..d8dd5f284 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2269,6 +2269,8 @@ def kill_process_tree(pid: int):
 class MemorySnapshot:
     """Memory snapshot."""
     torch_peak: int = 0
+    free_memory: int = 0
+    total_memory: int = 0
     cuda_memory: int = 0
     torch_memory: int = 0
     non_torch_memory: int = 0
@@ -2288,8 +2290,8 @@ class MemorySnapshot:
         self.torch_peak = torch.cuda.memory_stats().get(
             "allocated_bytes.all.peak", 0)
 
-        self.cuda_memory = torch.cuda.mem_get_info(
-        )[1] - torch.cuda.mem_get_info()[0]
+        self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        self.cuda_memory = self.total_memory - self.free_memory
 
         # torch.cuda.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
@@ -2302,6 +2304,8 @@ class MemorySnapshot:
     def __sub__(self, other: MemorySnapshot) -> MemorySnapshot:
         return MemorySnapshot(
             torch_peak=self.torch_peak - other.torch_peak,
+            free_memory=self.free_memory - other.free_memory,
+            total_memory=self.total_memory - other.total_memory,
             cuda_memory=self.cuda_memory - other.cuda_memory,
             torch_memory=self.torch_memory - other.torch_memory,
             non_torch_memory=self.non_torch_memory - other.non_torch_memory,
@@ -2323,6 +2327,16 @@ class MemoryProfilingResult:
     after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
     profile_time: float = 0.0
 
+    def __repr__(self) -> str:
+        return (f"Memory profiling takes {self.profile_time:.2f} seconds. "
+                f"Total non KV cache memory: "
+                f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
+                f"torch peak memory increase: "
+                f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
+                f"non-torch forward increase memory: "
+                f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
+                f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB.")
+
 
 @contextlib.contextmanager
 def memory_profiling(
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 1dfccc9b3..b7d244f27 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import GiB_bytes
+from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.utils import report_usage_stats
@@ -130,20 +130,22 @@ class Worker(WorkerBase):
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
-            requested_memory = (total_gpu_memory *
-                                self.cache_config.gpu_memory_utilization)
-            if self.init_gpu_memory < requested_memory:
+
+            # take current memory snapshot
+            self.init_snapshot = MemorySnapshot()
+            self.requested_memory = (self.init_snapshot.total_memory *
+                                     self.cache_config.gpu_memory_utilization)
+            if self.init_snapshot.free_memory < self.requested_memory:
                 GiB = lambda b: round(b / GiB_bytes, 2)
                 raise ValueError(
-                    f"Free memory on device ({GiB(self.init_gpu_memory)}/"
-                    f"{GiB(total_gpu_memory)} GiB) on startup is less than "
-                    f"desired GPU memory utilization "
+                    f"Free memory on device "
+                    f"({GiB(self.init_snapshot.free_memory)}/"
+                    f"{GiB(self.init_snapshot.total_memory)} GiB) on startup "
+                    f"is less than desired GPU memory utilization "
                     f"({self.cache_config.gpu_memory_utilization}, "
-                    f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+                    f"{GiB(self.requested_memory)} GiB). Decrease GPU memory "
                     f"utilization or reduce GPU memory used by other processes."
                 )
-
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -192,57 +194,39 @@ class Worker(WorkerBase):
         """
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
+        GiB = lambda b: b / GiB_bytes
 
-        _, total_gpu_memory = torch.cuda.mem_get_info()
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
-        self.model_runner.profile_run()
+        with memory_profiling(
+                self.init_snapshot,
+                weights_memory=int(
+                    self.model_runner.model_memory_usage)) as profile_result:
+            self.model_runner.profile_run()
 
-        free_gpu_memory, _ = torch.cuda.mem_get_info()
+        free_gpu_memory = profile_result.after_profile.free_memory
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
-        assert self.init_gpu_memory > free_gpu_memory, (
+        assert self.init_snapshot.free_memory > free_gpu_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory/GiB_bytes} GiB, "
-            f"current free memory {free_gpu_memory/GiB_bytes} GiB. "
-            f"This happens when the GPU memory was not properly cleaned up "
-            f"before initializing the vLLM instance.")
-
-        # Get the peak memory allocation recorded by torch
-        peak_torch_memory = torch.cuda.memory_stats(
-        )["allocated_bytes.all.peak"]
-
-        # Check for any memory left around that may have been allocated on the
-        # gpu outside of `torch`. NCCL operations, for example, can use a few
-        # GB during a forward pass.
-        torch.cuda.empty_cache()
-        torch_allocated_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.current"]
-
-        # Reset after emptying torch cache
-        free_gpu_memory = torch.cuda.mem_get_info()[0]
+            f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
+            f"current free memory {GiB(free_gpu_memory)} GiB. "
+            "This happens when other processes sharing the same container "
+            "release GPU memory while vLLM is profiling during initialization. "
+            "To fix this, ensure consistent GPU memory allocation or "
+            "isolate vLLM in its own container.")
+        available_kv_cache_memory = self.requested_memory \
+            - profile_result.non_kv_cache_memory
 
-        # Total forward allocation (current) is equal to the diff in free memory
-        fwd_alloc_bytes = self.init_gpu_memory - free_gpu_memory
-        # We assume current non-torch allocation is equal to peak
-        non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes)
-        # Total forward allocation (peak) is peak torch + non-torch
-        peak_memory = peak_torch_memory + non_torch_alloc_bytes
-
-        available_kv_cache_memory = (
-            total_gpu_memory * self.cache_config.gpu_memory_utilization -
-            peak_memory)
-
-        GiB = lambda b: b / GiB_bytes
         logger.debug(
             "Initial free memory: %.2f GiB, free memory: %.2f GiB, "
-            "total GPU memory: %.2f GiB", GiB(self.init_gpu_memory),
-            GiB(free_gpu_memory), GiB(total_gpu_memory))
-        logger.debug(
-            "Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
-            "%.2f GiB, available KVCache memory: %.2f GiB",
-            GiB(peak_torch_memory), GiB(non_torch_alloc_bytes),
-            GiB(available_kv_cache_memory))
+            "requested GPU memory: %.2f GiB",
+            GiB(self.init_snapshot.free_memory), GiB(free_gpu_memory),
+            GiB(self.requested_memory))
+        logger.debug(profile_result)
+        logger.info("Available KV cache memory: %.2f GiB",
+                    GiB(available_kv_cache_memory))
+        gc.collect()
 
         return int(available_kv_cache_memory)
 
-- 
GitLab


From 4589b940326d1480effc66ee33d30404ca31ed40 Mon Sep 17 00:00:00 2001
From: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
Date: Tue, 10 Jun 2025 09:04:36 +0800
Subject: [PATCH 259/274] [Bugfix] Fix benchmark_moe.py (#19016)

Signed-off-by: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
---
 benchmarks/kernels/benchmark_moe.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 6cb55b359..cef53b183 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -7,7 +7,6 @@ import time
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from types import SimpleNamespace
 from typing import Any, TypedDict
 
 import ray
@@ -43,7 +42,7 @@ def benchmark_config(
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
     num_iters: int = 100,
-    block_quant_shape: List[int] = None,
+    block_quant_shape: list[int] = None,
     use_deep_gemm: bool = False,
 ) -> float:
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@@ -400,7 +399,7 @@ class BenchmarkWorker:
         dtype: torch.dtype,
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
-        block_quant_shape: List[int] = None,
+        block_quant_shape: list[int] = None,
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
@@ -532,7 +531,7 @@ def save_configs(
     dtype: torch.dtype,
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
-    block_quant_shape: List[int],
+    block_quant_shape: list[int],
 ) -> None:
     dtype_str = get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@@ -563,7 +562,6 @@ def main(args: argparse.Namespace):
     config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
     if args.model_prefix:
         config = getattr(config, args.model_prefix)
-    config = SimpleNamespace(**config)
 
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
@@ -595,11 +593,7 @@ def main(args: argparse.Namespace):
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
     hidden_size = config.hidden_size
-    dtype = (
-        torch.float16
-        if current_platform.is_rocm()
-        else getattr(torch, config.torch_dtype)
-    )
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
-- 
GitLab


From 9af6d22e4c8ef79972911efaab2a68f4b03def2a Mon Sep 17 00:00:00 2001
From: XiongfeiWei <isaacwxf23@gmail.com>
Date: Mon, 9 Jun 2025 18:28:45 -0700
Subject: [PATCH 260/274] Use xla flag to improve the quantized model
 performance (#19303)

Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
---
 vllm/v1/worker/tpu_worker.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 16a9f0959..5da481bae 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -101,7 +101,10 @@ class TPUWorker:
         # fix this. It will be removed after the bug in XLA compiler is fixed.
         os.environ["LIBTPU_INIT_ARGS"] = (
             os.environ.get("LIBTPU_INIT_ARGS", "") +
-            " --xla_tpu_force_1d_allreduce_at_chunk_count=1")
+            " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
+            " --xla_jf_conv_input_fusion=False")
+        # --xla_jf_conv_input_fusion=False is used to improve the perf of
+        # quantized matmul.
         torch.set_grad_enabled(False)
         torch.set_default_dtype(self.model_config.dtype)
 
-- 
GitLab


From c016047ed7bc5b634384f91c187d90d0bdcb87e9 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 10 Jun 2025 05:36:54 +0100
Subject: [PATCH 261/274] Fix docs/mkdocs/hooks/remove_announcement.py (#19382)

---
 docs/mkdocs/hooks/remove_announcement.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py
index f67941d2a..1a84039ab 100644
--- a/docs/mkdocs/hooks/remove_announcement.py
+++ b/docs/mkdocs/hooks/remove_announcement.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
+from pathlib import Path
 from typing import Literal
 
 
@@ -8,10 +9,9 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
     if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
         # remove the warning banner if the version is a tagged release
-        docs_dir = os.path.dirname(__file__)
-        announcement_path = os.path.join(docs_dir,
-                                         "mkdocs/overrides/main.html")
+        mkdocs_dir = Path(__file__).parent.parent
+        announcement_path = mkdocs_dir / "overrides/main.html"
         # The file might be removed already if the build is triggered multiple
         # times (readthedocs build both HTML and PDF versions separately)
-        if os.path.exists(announcement_path):
+        if announcement_path.exists():
             os.remove(announcement_path)
-- 
GitLab


From 6cd4ae8acdaacdfc89d2e025b24ba1f1927af260 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:55:09 +0800
Subject: [PATCH 262/274] [Frontend] Add tqdm_leave_pbar to control progress
 bar visibility (#19357)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 vllm/entrypoints/llm.py | 82 ++++++++++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 30 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 7c9ca41dd..6e3cb18fc 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -281,7 +281,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         Sequence[SamplingParams]]] = None,
         *,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -297,7 +297,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         list[SamplingParams]]] = None,
         prompt_token_ids: Optional[list[int]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -313,7 +313,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         list[SamplingParams]]] = None,
         prompt_token_ids: Optional[list[list[int]]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -330,7 +330,7 @@ class LLM:
                                         list[SamplingParams]]] = None,
         *,
         prompt_token_ids: list[int],
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -347,7 +347,7 @@ class LLM:
                                         list[SamplingParams]]] = None,
         *,
         prompt_token_ids: list[list[int]],
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -362,7 +362,7 @@ class LLM:
         prompts: None,
         sampling_params: None,
         prompt_token_ids: Union[list[int], list[list[int]]],
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -382,7 +382,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         Sequence[SamplingParams]]] = None,
         prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -404,7 +404,10 @@ class LLM:
                 When it is a single value, it is applied to every prompt.
                 When it is a list, the list must have the same length as the
                 prompts and it is paired one by one with the prompt.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -678,7 +681,7 @@ class LLM:
                         list[list[ChatCompletionMessageParam]]],
         sampling_params: Optional[Union[SamplingParams,
                                         list[SamplingParams]]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[LoRARequest] = None,
         chat_template: Optional[str] = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
@@ -709,7 +712,10 @@ class LLM:
                 is a single value, it is applied to every prompt. When it
                 is a list, the list must have the same length as the
                 prompts and it is paired one by one with the prompt.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             chat_template: The template to use for structuring the chat.
                 If not provided, the model's default chat template will be used.
@@ -823,7 +829,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         *,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -838,7 +844,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[list[int]] = None,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -853,7 +859,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[list[list[int]]] = None,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -869,7 +875,7 @@ class LLM:
         *,
         prompt_token_ids: list[int],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -885,7 +891,7 @@ class LLM:
         *,
         prompt_token_ids: list[list[int]],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -899,7 +905,7 @@ class LLM:
         pooling_params: None,
         prompt_token_ids: Union[list[int], list[list[int]]],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -918,7 +924,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -935,7 +941,10 @@ class LLM:
                 for more details about the format of each prompts.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -1005,7 +1014,7 @@ class LLM:
         /,
         *,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         pooling_params: Optional[Union[PoolingParams,
                                        Sequence[PoolingParams]]] = None,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
@@ -1024,7 +1033,10 @@ class LLM:
                 for more details about the format of each prompts.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -1051,7 +1063,7 @@ class LLM:
         prompts: Union[PromptType, Sequence[PromptType]],
         /,
         *,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ClassificationRequestOutput]:
@@ -1066,7 +1078,10 @@ class LLM:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
                 for batch inference. See [PromptType][vllm.inputs.PromptType]
                 for more details about the format of each prompts.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -1092,7 +1107,7 @@ class LLM:
         text_1: list[Union[str, TextPrompt, TokensPrompt]],
         text_2: list[Union[str, TextPrompt, TokensPrompt]],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ScoringRequestOutput]:
@@ -1126,7 +1141,7 @@ class LLM:
         text_1: list[str],
         text_2: list[str],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ScoringRequestOutput]:
@@ -1178,7 +1193,7 @@ class LLM:
         /,
         *,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ScoringRequestOutput]:
@@ -1198,7 +1213,10 @@ class LLM:
             text_2: The texts to pair with the query to form the input
                 to the LLM. See [PromptType][vllm.inputs.PromptType] for
                 more details about the format of each prompts.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -1379,7 +1397,7 @@ class LLM:
         params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                       Sequence[PoolingParams]],
         *,
-        use_tqdm: bool,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
         prompt_adapter_request: Optional[PromptAdapterRequest],
         tokenization_kwargs: Optional[dict[str, Any]] = None,
@@ -1417,7 +1435,8 @@ class LLM:
         # Add requests to the engine.
         it = prompts
         if use_tqdm:
-            it = tqdm(it, desc="Adding requests")
+            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
+            it = tqdm_func(it, desc="Adding requests")
 
         for i, prompt in enumerate(it):
             self._add_request(
@@ -1474,12 +1493,15 @@ class LLM:
         return params
 
     def _run_engine(
-            self, *, use_tqdm: bool
+        self,
+        *,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True
     ) -> list[Union[RequestOutput, PoolingRequestOutput]]:
         # Initialize tqdm.
         if use_tqdm:
             num_requests = self.llm_engine.get_num_unfinished_requests()
-            pbar = tqdm(
+            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
+            pbar = tqdm_func(
                 total=num_requests,
                 desc="Processed prompts",
                 dynamic_ncols=True,
-- 
GitLab


From 646d62f636197d5d809d21090752700e094b8b86 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Mon, 9 Jun 2025 22:01:17 -0700
Subject: [PATCH 263/274] [Core] Use tuple for kv cache group block ids
 (#19175)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 tests/v1/core/test_prefix_caching.py         |  44 ++++----
 tests/v1/tpu/worker/test_tpu_model_runner.py |   8 +-
 tests/v1/worker/test_gpu_input_batch.py      |   2 +-
 tests/v1/worker/test_gpu_model_runner.py     |   4 +-
 vllm/v1/core/block_pool.py                   |   8 +-
 vllm/v1/core/kv_cache_coordinator.py         | 101 ++++++++++---------
 vllm/v1/core/kv_cache_manager.py             |  38 ++++---
 vllm/v1/core/sched/output.py                 |   8 +-
 vllm/v1/core/sched/scheduler.py              |  12 +--
 vllm/v1/core/single_type_kv_cache_manager.py |  50 +++++----
 vllm/v1/worker/block_table.py                |   5 +-
 vllm/v1/worker/gpu_input_batch.py            |   2 +-
 12 files changed, 140 insertions(+), 142 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index bf4cb539e..394336624 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -117,7 +117,7 @@ def test_prefill(hash_algo):
     blocks = manager.allocate_slots(req0, 55,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
 
     # Check full block metadata
     parent_block_hash = None
@@ -141,13 +141,13 @@ def test_prefill(hash_algo):
     req1 = make_request("1", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[5]]
+    assert blocks.get_block_ids() == ([5], )
     for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
@@ -175,13 +175,13 @@ def test_prefill(hash_algo):
     req2 = make_request("2", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(manager.req_to_block_hashes[req2.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[6]]
+    assert blocks.get_block_ids() == ([6], )
 
     # Although we only have 6 free blocks, we have 8 blocks in
     # the free block queue due to lazy removal.
@@ -205,7 +205,7 @@ def test_prefill(hash_algo):
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     # This block ID order also checks the eviction order.
-    assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]]
+    assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], )
     assert manager.block_pool.free_block_queue.num_free_blocks == 0
     assert manager.block_pool.free_block_queue.free_list_head is None
     assert manager.block_pool.free_block_queue.free_list_tail is None
@@ -236,8 +236,8 @@ def test_prefill_hybrid_model():
     blocks = manager.allocate_slots(req0, 55,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4], [5, 6, 7, 8],
-                                      [9, 10, 11, 12]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7,
+                                                     8], [9, 10, 11, 12])
 
     # Check full block metadata
     parent_block_hash = None
@@ -263,14 +263,14 @@ def test_prefill_hybrid_model():
     req1 = make_request("1", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [[1, 2, 3], [0, 6, 7],
-                                               [0, 10, 11]]
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], [0, 6,
+                                                           7], [0, 10, 11])
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[13], [14], [15]]
+    assert blocks.get_block_ids() == ([13], [14], [15])
     for block_per_group in computed_blocks.blocks:
         for block in block_per_group:
             if block != manager.block_pool.null_block:
@@ -374,7 +374,7 @@ def test_prefill_plp():
     blocks = manager.allocate_slots(req0, 55,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
     req0_block_hashes = [b.block_hash for b in blocks.blocks[0]]
 
     # Check full block metadata
@@ -400,13 +400,13 @@ def test_prefill_plp():
     req1 = make_request("1", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[5]]
+    assert blocks.get_block_ids() == ([5], )
     for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
@@ -444,7 +444,7 @@ def test_prefill_plp():
     block_ids = blocks.get_block_ids()
     # Duplicate cached blocks have different ids but same hashes vs request #0
     assert [b.block_hash for b in blocks.blocks[0]] == req0_block_hashes
-    assert block_ids != [[1, 2, 3, 4]]
+    assert block_ids != ([1, 2, 3, 4], )
 
     # Request #2 block hashes are valid since request #0 hashes are.
     # Check block reference counts.
@@ -474,7 +474,7 @@ def test_decode():
     blocks = manager.allocate_slots(req0, 55,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
 
     # Append slots without allocating a new block.
     req0.num_computed_tokens = 55
@@ -546,12 +546,12 @@ def test_evict():
     # Touch the first 2 blocks.
     req2 = make_request("2", list(range(2 * 16 + 3)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert computed_blocks.get_block_ids() == [[1, 2]]
+    assert computed_blocks.get_block_ids() == ([1, 2], )
     assert num_computed_tokens == 2 * 16
     blocks = manager.allocate_slots(req2, 3,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[10]]
+    assert blocks.get_block_ids() == ([10], )
     assert manager.block_pool.free_block_queue.num_free_blocks == 7
 
 
@@ -865,7 +865,7 @@ def test_mm_prefix_caching():
     blocks = manager.allocate_slots(req0, 59,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
     req0.num_computed_tokens = 59
 
     # Append slots without allocating a new block.
@@ -926,7 +926,7 @@ def test_cache_key_salting():
     blocks = manager.allocate_slots(req0, 59,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
     req0.num_computed_tokens = 59
 
     # Append slots without allocating a new block.
@@ -1042,7 +1042,7 @@ def test_reset_prefix_cache():
     all_token_ids = full_block_token_ids + unique_token_ids
     req0 = make_request("0", all_token_ids)
     blocks = manager.allocate_slots(req0, 55)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
 
     unique_token_ids = [4] * 7
     all_token_ids = full_block_token_ids + unique_token_ids
@@ -1053,7 +1053,7 @@ def test_reset_prefix_cache():
     blocks = manager.allocate_slots(req1, 7,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[5]]
+    assert blocks.get_block_ids() == ([5], )
 
     # Failed to reset prefix cache because some blocks are not freed yet.
     assert not manager.reset_prefix_cache()
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 320ebef40..0e7d305fe 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -71,7 +71,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                 mm_hashes=[],
                 mm_positions=[],
                 sampling_params=SamplingParams(),
-                block_ids=[[0]],  # block_ids should be list[list[int]]
+                block_ids=([0], ),  # block_ids should be tuple[list[int]]
                 num_computed_tokens=0,
                 lora_request=None,
             ))
@@ -116,10 +116,10 @@ def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
     # This is safe since we currently only use single KV cache groups
     block_table = multi_group_block_table[0]
 
-    # req_state.block_ids is now list[list[int]] for MultiGroupBlockTable
+    # req_state.block_ids is now tuple[list[int], ...] for MultiGroupBlockTable
     # Extract the first group's block IDs
     if isinstance(req_state.block_ids[0], list):
-        # New format: list[list[int]] - extract first group
+        # New format: tuple[list[int], ...] - extract first group
         req_block_ids = req_state.block_ids[0]
     else:
         # Legacy format: list[int] - use directly
@@ -210,7 +210,7 @@ def test_update_states_request_resumed(model_runner):
         req_id=req_id,
         resumed_from_preemption=False,
         new_token_ids=[],
-        new_block_ids=[[]],
+        new_block_ids=([], ),
         num_computed_tokens=0,
     )
 
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 72547e86b..de6ebe4f6 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -203,7 +203,7 @@ def _construct_cached_request_state(req_id_suffix: int):
         sampling_params=_create_sampling_params(),
         mm_inputs=[],
         mm_positions=[],
-        block_ids=[[]],
+        block_ids=([], ),
         generator=None,
         num_computed_tokens=len(output_token_ids),
         output_token_ids=output_token_ids,
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 3d51b53df..fa0bab71b 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -123,7 +123,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                 mm_hashes=[],
                 mm_positions=[],
                 sampling_params=SamplingParams(),
-                block_ids=[[0]],
+                block_ids=([0], ),
                 num_computed_tokens=0,
                 lora_request=None,
             ))
@@ -251,7 +251,7 @@ def test_update_states_request_resumed(model_runner):
         req_id=req_id,
         resumed_from_preemption=False,
         new_token_ids=[],
-        new_block_ids=[[]],
+        new_block_ids=([], ),
         num_computed_tokens=0,
     )
 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index 3b2a4f936..d21f94727 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -89,8 +89,8 @@ class BlockPool:
                 BlockHashWithGroupId(block_hash, group_id))
             if not cached_blocks_one_group:
                 return None
-            first_block_id = next(iter(cached_blocks_one_group))
-            cached_blocks.append(cached_blocks_one_group[first_block_id])
+            first_block = next(iter(cached_blocks_one_group.values()))
+            cached_blocks.append(first_block)
         return cached_blocks
 
     def cache_full_blocks(
@@ -260,7 +260,7 @@ class BlockPool:
             return True
         return False
 
-    def touch(self, blocks: list[list[KVCacheBlock]]) -> None:
+    def touch(self, blocks: tuple[list[KVCacheBlock], ...]) -> None:
         """Touch a block increases its reference count by 1, and may remove
         the block from the free queue. This is used when a block is hit by
         another request with the same prefix.
@@ -299,7 +299,7 @@ class BlockPool:
             bool: True if the prefix cache is successfully reset,
             False otherwise.
         """
-        num_used_blocks = (self.num_gpu_blocks - self.get_num_free_blocks())
+        num_used_blocks = self.num_gpu_blocks - self.get_num_free_blocks()
         if num_used_blocks != 1:  # The null block is always marked as used
             logger.warning(
                 "Failed to reset prefix cache because some "
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 231bad1df..5620d9bee 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -5,8 +5,7 @@ from typing import Callable, Optional
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.core.single_type_kv_cache_manager import (
-    FullAttentionManager, SingleTypeKVCacheManager,
-    get_manager_for_kv_cache_spec)
+    FullAttentionManager, get_manager_for_kv_cache_spec)
 from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig
 from vllm.v1.request import Request
 
@@ -30,25 +29,21 @@ class KVCacheCoordinator(ABC):
 
         self.block_pool = BlockPool(kv_cache_config.num_blocks, enable_caching,
                                     enable_kv_cache_events)
-        self.single_type_managers: list[SingleTypeKVCacheManager] = []
 
         # Needs special handling for find_longest_cache_hit if eagle is enabled
         self.use_eagle = use_eagle
-
-        for i in range(len(self.kv_cache_config.kv_cache_groups)):
-            kv_cache_spec = self.kv_cache_config.kv_cache_groups[
-                i].kv_cache_spec
-            self.single_type_managers.append(
-                get_manager_for_kv_cache_spec(
-                    kv_cache_spec=kv_cache_spec,
-                    block_pool=self.block_pool,
-                    kv_cache_group_id=i,
-                    caching_hash_fn=caching_hash_fn,
-                ))
+        self.single_type_managers = tuple(
+            get_manager_for_kv_cache_spec(
+                kv_cache_spec=kv_cache_group.kv_cache_spec,
+                block_pool=self.block_pool,
+                kv_cache_group_id=i,
+                caching_hash_fn=caching_hash_fn,
+            ) for i, kv_cache_group in enumerate(
+                self.kv_cache_config.kv_cache_groups))
 
     def get_num_blocks_to_allocate(
             self, request_id: str, num_tokens: int,
-            new_computed_blocks: list[list[KVCacheBlock]]) -> int:
+            new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
 
@@ -70,7 +65,7 @@ class KVCacheCoordinator(ABC):
 
     def save_new_computed_blocks(
             self, request_id: str,
-            new_computed_blocks: list[list[KVCacheBlock]]) -> None:
+            new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> None:
         """
         Add the new computed blocks to the request.
 
@@ -84,7 +79,7 @@ class KVCacheCoordinator(ABC):
                                              new_computed_blocks[i])
 
     def allocate_new_blocks(self, request_id: str,
-                            num_tokens: int) -> list[list[KVCacheBlock]]:
+                            num_tokens: int) -> tuple[list[KVCacheBlock], ...]:
         """
         Allocate new blocks for the request to give it at least `num_tokens` 
         token slots.
@@ -97,11 +92,9 @@ class KVCacheCoordinator(ABC):
         Returns:
             The new allocated blocks.
         """
-        new_blocks = []
-        for manager in self.single_type_managers:
-            new_blocks.append(
-                manager.allocate_new_blocks(request_id, num_tokens))
-        return new_blocks
+        return tuple(
+            manager.allocate_new_blocks(request_id, num_tokens)
+            for manager in self.single_type_managers)
 
     def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
                      num_computed_tokens: int) -> None:
@@ -159,19 +152,20 @@ class KVCacheCoordinator(ABC):
         for manager in self.single_type_managers:
             manager.remove_skipped_blocks(request_id, num_computed_tokens)
 
-    def get_blocks(self, request_id: str) -> list[list[KVCacheBlock]]:
+    def get_blocks(self, request_id: str) -> tuple[list[KVCacheBlock], ...]:
         """
         Get the blocks for the request.
         """
-        return [
+        return tuple(
             manager.req_to_blocks.get(request_id) or []
-            for manager in self.single_type_managers
-        ]
+            for manager in self.single_type_managers)
 
     @abstractmethod
     def find_longest_cache_hit(
-            self, block_hashes: list[BlockHash],
-            max_cache_hit_length: int) -> tuple[list[list[KVCacheBlock]], int]:
+        self,
+        block_hashes: list[BlockHash],
+        max_cache_hit_length: int,
+    ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
         pass
 
 
@@ -195,8 +189,10 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
             "UnitaryKVCacheCoordinator assumes only one kv cache group")
 
     def find_longest_cache_hit(
-            self, block_hashes: list[BlockHash],
-            max_cache_hit_length: int) -> tuple[list[list[KVCacheBlock]], int]:
+        self,
+        block_hashes: list[BlockHash],
+        max_cache_hit_length: int,
+    ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
         hit_blocks = self.single_type_managers[0].find_longest_cache_hit(
             block_hashes=block_hashes,
             max_length=max_cache_hit_length,
@@ -275,11 +271,24 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
             "KVCacheCoordinator assumes the block_size of full attention "
             "layers is divisible by other layers now.")
 
+        if max(self.full_attention_group_ids) < min(self.other_group_ids):
+            self.full_attn_first = True
+        elif max(self.other_group_ids) < min(self.full_attention_group_ids):
+            self.full_attn_first = False
+        else:
+            raise ValueError(
+                "HybridKVCacheCoordinator assumes the full "
+                "attention group ids and other attention group ids "
+                "do not interleave, either full attention group ids "
+                "are before other attention group ids or vice versa."
+                "This is for simplifying merging hit_blocks_full_attn and "
+                "hit_blocks_other_attn to hit_blocks.")
+
     def find_longest_cache_hit(
         self,
         block_hashes: list[BlockHash],
         max_cache_hit_length: int,
-    ) -> tuple[list[list[KVCacheBlock]], int]:
+    ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
         """
         Find the longest cache hit for the request.
 
@@ -318,27 +327,25 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
             ))
         hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
 
-        # NOTE: the prefix cache hit length must be a multiply of block_size as
+        # NOTE: the prefix cache hit length must be a multiple of block_size as
         # we don't support partial block cache hit yet. The cache hit length
-        # of other attention is ensured to be a multiply of the block size of
+        # of other attention is ensured to be a multiple of the block size of
         # full attention layers in current implementation, because hit_length is
-        # a multiply of other attention's block size, and other attention's
-        # block size is a multiply of full attention's block size (verified in
+        # a multiple of other attention's block size, and other attention's
+        # block size is a multiple of full attention's block size (verified in
         # `verify_and_split_kv_cache_groups`).
         assert hit_length % self.full_attention_block_size == 0
 
         # Truncate the full attention cache hit to the length of the
         # cache hit of the other attention.
-        for i in range(len(hit_blocks_full_attn)):
-            del hit_blocks_full_attn[i][hit_length //
-                                        self.full_attention_block_size:]
+        for group_hit_blocks in hit_blocks_full_attn:
+            del group_hit_blocks[hit_length // self.full_attention_block_size:]
 
         # Merge the hit blocks of full attention and other attention.
-        hit_blocks = hit_blocks_other_attn
-        for group_id, blocks in enumerate(hit_blocks_full_attn):
-            # NOTE: there is only one full attention group in most cases. So
-            # the time complexity of insert is fine.
-            hit_blocks.insert(group_id, blocks)
+        if self.full_attn_first:
+            hit_blocks = hit_blocks_full_attn + hit_blocks_other_attn
+        else:
+            hit_blocks = hit_blocks_other_attn + hit_blocks_full_attn
         return hit_blocks, hit_length
 
 
@@ -351,8 +358,6 @@ def get_kv_cache_coordinator(
                                          use_eagle, enable_caching,
                                          caching_hash_fn,
                                          enable_kv_cache_events)
-    else:
-        return HybridKVCacheCoordinator(kv_cache_config, max_model_len,
-                                        use_eagle, enable_caching,
-                                        caching_hash_fn,
-                                        enable_kv_cache_events)
+    return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle,
+                                    enable_caching, caching_hash_fn,
+                                    enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 35fb189fd..2e09f4c0a 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -21,11 +21,11 @@ logger = init_logger(__name__)
 @dataclass
 class KVCacheBlocks:
     """
-    The allocation result of KVCacheManager, work as the interface between 
-    Scheduler and KVCacheManager, to hide KVCacheManager's internal data 
+    The allocation result of KVCacheManager, work as the interface between
+    Scheduler and KVCacheManager, to hide KVCacheManager's internal data
     structure from the Scheduler.
     """
-    blocks: list[list[KVCacheBlock]]
+    blocks: tuple[list[KVCacheBlock], ...]
     """
     blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
     We don't use block of tokens as the outer dimension because it assumes all
@@ -37,21 +37,19 @@ class KVCacheBlocks:
     def __add__(self, other: "KVCacheBlocks") -> "KVCacheBlocks":
         """Adds two KVCacheBlocks instances."""
         return KVCacheBlocks(
-            [blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks)])
+            tuple(blk1 + blk2
+                  for blk1, blk2 in zip(self.blocks, other.blocks)))
 
-    def get_block_ids(self) -> list[list[int]]:
+    def get_block_ids(self) -> tuple[list[int], ...]:
         """
         Converts the KVCacheBlocks instance to block_ids.
         
         Returns:
-            list[list[int]]: A two-level list where
-            * the outer list corresponds to KV cache groups
+            tuple[list[int], ...]: A tuple of lists where
+            * the outer tuple corresponds to KV cache groups
             * each inner list contains the block_ids of the blocks in that group
         """
-        block_ids = []
-        for group in self.blocks:
-            block_ids.append([blk.block_id for blk in group])
-        return block_ids
+        return tuple([blk.block_id for blk in group] for group in self.blocks)
 
     def get_unhashed_block_ids(self) -> list[int]:
         """Get block_ids of unhashed blocks from KVCacheBlocks instance."""
@@ -63,7 +61,7 @@ class KVCacheBlocks:
 
     def new_empty(self) -> "KVCacheBlocks":
         """Creates a new KVCacheBlocks instance with no blocks."""
-        return KVCacheBlocks([[] for _ in range(len(self.blocks))])
+        return KVCacheBlocks(tuple([] for _ in range(len(self.blocks))))
 
 
 class KVCacheManager:
@@ -232,9 +230,8 @@ class KVCacheManager:
         if new_computed_blocks is not None:
             new_computed_block_list = new_computed_blocks.blocks
         else:
-            new_computed_block_list = [
-                [] for _ in range(len(self.kv_cache_config.kv_cache_groups))
-            ]
+            new_computed_block_list = tuple(
+                [] for _ in range(len(self.kv_cache_config.kv_cache_groups)))
 
         # Free the blocks that are skipped during the attention computation
         # (e.g., tokens outside the sliding window).
@@ -267,7 +264,7 @@ class KVCacheManager:
         if self.enable_caching:
             self.block_pool.touch(new_computed_block_list)
         else:
-            assert all(not blocks for blocks in new_computed_block_list), (
+            assert not any(new_computed_block_list), (
                 "Computed blocks should be empty when "
                 "prefix caching is disabled")
 
@@ -378,17 +375,18 @@ class KVCacheManager:
         """
         return self.block_pool.take_events()
 
-    def get_block_ids(self, request_id: str) -> list[list[int]]:
+    def get_block_ids(self, request_id: str) -> tuple[list[int], ...]:
         """Get the block ids of a request."""
         return KVCacheBlocks(
             self.coordinator.get_blocks(request_id)).get_block_ids()
 
-    def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
-                     num_computed_tokens: int) -> None:
+    def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:
         """Cache the blocks for the request."""
+        block_hashes = self.req_to_block_hashes[request.request_id]
         self.coordinator.cache_blocks(request, block_hashes,
                                       num_computed_tokens)
 
     def create_empty_block_list(self) -> KVCacheBlocks:
         """Creates a new KVCacheBlocks instance with no blocks."""
-        return KVCacheBlocks([[] for _ in range(self.num_kv_cache_groups)])
+        return KVCacheBlocks(tuple([]
+                                   for _ in range(self.num_kv_cache_groups)))
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index b404c70eb..9b0a439fe 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -27,7 +27,7 @@ class NewRequestData:
     mm_hashes: list[str]
     mm_positions: list[PlaceholderRange]
     sampling_params: SamplingParams
-    block_ids: list[list[int]]
+    block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     lora_request: Optional[LoRARequest]
 
@@ -35,7 +35,7 @@ class NewRequestData:
     def from_request(
         cls,
         request: Request,
-        block_ids: list[list[int]],
+        block_ids: tuple[list[int], ...],
     ) -> NewRequestData:
         return cls(
             req_id=request.request_id,
@@ -86,7 +86,7 @@ class CachedRequestData:
     # request's block IDs instead of appending to the existing block IDs.
     resumed_from_preemption: bool
     new_token_ids: list[int]
-    new_block_ids: list[list[int]]
+    new_block_ids: tuple[list[int], ...]
     num_computed_tokens: int
 
     @classmethod
@@ -95,7 +95,7 @@ class CachedRequestData:
         request: Request,
         resumed_from_preemption: bool,
         new_token_ids: list[int],
-        new_block_ids: list[list[int]],
+        new_block_ids: tuple[list[int], ...],
     ) -> CachedRequestData:
         return cls(
             req_id=request.request_id,
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index b3293d9a5..3d7bbe7e0 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -180,7 +180,7 @@ class Scheduler(SchedulerInterface):
         # uses structured decoding.
         structured_output_request_ids: dict[str, int] = {}
 
-        req_to_new_block_ids: dict[str, list[list[int]]] = {}
+        req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {}
         num_scheduled_tokens: dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
         # Encoder-related.
@@ -471,7 +471,7 @@ class Scheduler(SchedulerInterface):
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
                 request.num_computed_tokens = num_computed_tokens
-                # Count the number of prifix cached tokens.
+                # Count the number of prefix cached tokens.
                 if request.num_cached_tokens < 0:
                     request.num_cached_tokens = num_computed_tokens
                 # Encoder-related.
@@ -588,7 +588,7 @@ class Scheduler(SchedulerInterface):
         request: Request,
         num_scheduled_tokens: int,
         num_scheduled_spec_tokens: int,
-        new_block_ids: list[list[int]],
+        new_block_ids: tuple[list[int], ...],
         resumed_from_preemption: bool,
     ) -> CachedRequestData:
         # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
@@ -1015,11 +1015,7 @@ class Scheduler(SchedulerInterface):
         num_computed_tokens = min(num_computed_tokens, request.num_tokens)
         if num_computed_tokens == request.num_tokens:
             num_computed_tokens -= 1
-        self.kv_cache_manager.cache_blocks(
-            request,
-            self.kv_cache_manager.req_to_block_hashes[request.request_id],
-            num_computed_tokens,
-        )
+        self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
 
         # Update the request state for scheduling.
         request.num_computed_tokens = num_computed_tokens
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 98d758f82..95222779c 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -197,7 +197,7 @@ class SingleTypeKVCacheManager(ABC):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
-    ) -> list[list[KVCacheBlock]]:
+    ) -> tuple[list[KVCacheBlock], ...]:
         """
         Get the longest cache hit prefix of the blocks that is not longer than 
         `max_length`. The prefix should be a common prefix hit for all the 
@@ -222,7 +222,7 @@ class SingleTypeKVCacheManager(ABC):
             element is a list of cached blocks for the i-th kv cache group
             in `kv_cache_group_ids`.
             For example, sliding window manager should return a list like
-            [[NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)]] for block size 4 
+            ([NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)]) for block size 4
             and sliding window 8 and len(kv_cache_group_ids) = 1.
         """
 
@@ -254,27 +254,25 @@ class FullAttentionManager(SingleTypeKVCacheManager):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
-    ) -> list[list[KVCacheBlock]]:
+    ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(kv_cache_spec, FullAttentionSpec), (
             "FullAttentionManager can only be used for full attention groups")
-        computed_blocks: list[list[KVCacheBlock]] = [
-            [] for _ in range(len(kv_cache_group_ids))
-        ]
+        computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
+            [] for _ in range(len(kv_cache_group_ids)))
         max_num_blocks = max_length // kv_cache_spec.block_size
-        for i in range(max_num_blocks):
-            block_hash = block_hashes[i]
+        for i, block_hash in zip(range(max_num_blocks), block_hashes):
             # block_hashes is a chain of block hashes. If a block hash is not
             # in the cached_block_hash_to_id, the following block hashes are
             # not computed yet for sure.
             if cached_block := block_pool.get_cached_block(
                     block_hash, kv_cache_group_ids):
-                for j in range(len(kv_cache_group_ids)):
-                    computed_blocks[j].append(cached_block[j])
+                for computed, cached in zip(computed_blocks, cached_block):
+                    computed.append(cached)
             else:
                 break
-        if use_eagle and len(computed_blocks[0]) > 0:
-            for j in range(len(kv_cache_group_ids)):
-                computed_blocks[j].pop()
+        if use_eagle and computed_blocks[0]:
+            for computed in computed_blocks:
+                computed.pop()
         return computed_blocks
 
     def remove_skipped_blocks(self, request_id: str,
@@ -311,7 +309,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
-    ) -> list[list[KVCacheBlock]]:
+    ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(kv_cache_spec, SlidingWindowSpec), (
             "SlidingWindowManager can only be used for sliding window groups")
 
@@ -332,23 +330,23 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         # sliding_window_contiguous_blocks),
         # which is good for low cache hit rate scenarios.
         max_num_blocks = max_length // kv_cache_spec.block_size
-        computed_blocks = [[block_pool.null_block] * max_num_blocks
-                           for _ in range(len(kv_cache_group_ids))]
+        computed_blocks = tuple([block_pool.null_block] * max_num_blocks
+                                for _ in range(len(kv_cache_group_ids)))
         num_contiguous_blocks = 0
         match_found = False
         # Search from right to left and early stop when a match is found.
         for i in range(max_num_blocks - 1, -1, -1):
             if cached_block := block_pool.get_cached_block(
                     block_hashes[i], kv_cache_group_ids):
-                for j in range(len(kv_cache_group_ids)):
-                    computed_blocks[j][i] = cached_block[j]
+                for computed, cached in zip(computed_blocks, cached_block):
+                    computed[i] = cached
                 num_contiguous_blocks += 1
-                if (num_contiguous_blocks >= sliding_window_contiguous_blocks):
+                if num_contiguous_blocks >= sliding_window_contiguous_blocks:
                     # Trim the trailing blocks.
                     # E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3]
                     # when sliding_window_contiguous_blocks=2.
-                    for j in range(len(kv_cache_group_ids)):
-                        del computed_blocks[j][i + num_contiguous_blocks:]
+                    for computed in computed_blocks:
+                        del computed[i + num_contiguous_blocks:]
                     match_found = True
                     break
             else:
@@ -356,11 +354,11 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         if not match_found:
             # The first `num_contiguous_blocks` is a cache hit even if
             # `num_contiguous_blocks < sliding_window_contiguous_blocks`.
-            for j in range(len(kv_cache_group_ids)):
-                del computed_blocks[j][num_contiguous_blocks:]
-        if use_eagle and len(computed_blocks[0]) > 0:
-            for j in range(len(kv_cache_group_ids)):
-                computed_blocks[j].pop()
+            for computed in computed_blocks:
+                del computed[num_contiguous_blocks:]
+        if use_eagle and computed_blocks[0]:
+            for computed in computed_blocks:
+                computed.pop()
         return computed_blocks
 
     def remove_skipped_blocks(self, request_id: str,
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 5cd5674fb..8f4e8d64c 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -112,11 +112,12 @@ class MultiGroupBlockTable:
             for block_size in block_sizes
         ]
 
-    def append_row(self, block_ids: list[list[int]], row_idx: int) -> None:
+    def append_row(self, block_ids: tuple[list[int], ...],
+                   row_idx: int) -> None:
         for i, block_table in enumerate(self.block_tables):
             block_table.append_row(block_ids[i], row_idx)
 
-    def add_row(self, block_ids: list[list[int]], row_idx: int) -> None:
+    def add_row(self, block_ids: tuple[list[int], ...], row_idx: int) -> None:
         for i, block_table in enumerate(self.block_tables):
             block_table.add_row(block_ids[i], row_idx)
 
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 34737029f..ebb770a7d 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -30,7 +30,7 @@ class CachedRequestState:
     sampling_params: SamplingParams
     generator: Optional[torch.Generator]
 
-    block_ids: list[list[int]]
+    block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     output_token_ids: list[int]
 
-- 
GitLab


From 1efef716458ab03e0954ef2825ac71cf4f81cf9d Mon Sep 17 00:00:00 2001
From: Li Wang <wangli858794774@gmail.com>
Date: Tue, 10 Jun 2025 13:39:37 +0800
Subject: [PATCH 264/274] [Bugfix] Fix modelscope token passed in (#19389)

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/transformers_utils/config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index d66e296fd..52a7a903c 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -143,7 +143,9 @@ def list_repo_files(
                     modelscope_list_repo_files)
                 return modelscope_list_repo_files(repo_id,
                                                   revision=revision,
-                                                  token=token)
+                                                  token=os.getenv(
+                                                      "MODELSCOPE_API_TOKEN",
+                                                      None))
             return hf_list_repo_files(repo_id,
                                       revision=revision,
                                       repo_type=repo_type,
-- 
GitLab


From 319cb1e351736890e1615ab827639d7fd05fc988 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Tue, 10 Jun 2025 07:44:59 +0200
Subject: [PATCH 265/274] [Core] Batch multi modal input using pinned memory
 (#19169)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/multimodal/inputs.py          | 19 ++++++++++++++-----
 vllm/v1/worker/gpu_model_runner.py |  6 ++++--
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 0bf5b1cf1..5cb720381 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -680,7 +680,8 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         return self._items_by_modality.keys()
 
     @staticmethod
-    def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
+    def _try_stack(nested_tensors: NestedTensors,
+                   pin_memory: bool = False) -> NestedTensors:
         """
         Stack the inner dimensions that have the same shape in
         a nested list of tensors.
@@ -697,7 +698,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         if isinstance(nested_tensors, (int, float)):
             return torch.tensor(nested_tensors)
 
-        stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors]
+        stacked = [
+            MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors
+        ]
         if not is_list_of(stacked, torch.Tensor, check="all"):
             # Only tensors (not lists) can be stacked.
             return stacked
@@ -713,10 +716,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
             # The tensors have incompatible shapes and can't be stacked.
             return tensors_
 
-        return torch.stack(tensors_)
+        outputs = torch.empty(len(tensors_),
+                              *tensors_[0].shape,
+                              dtype=tensors_[0].dtype,
+                              device=tensors_[0].device,
+                              pin_memory=pin_memory)
+        return torch.stack(tensors_, out=outputs)
 
     @staticmethod
-    def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs:
+    def batch(inputs_list: list["MultiModalKwargs"],
+              pin_memory: bool = False) -> BatchedTensorInputs:
         """
         Batch multiple inputs together into a dictionary.
 
@@ -738,7 +747,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
                 item_lists[k].append(v)
 
         return {
-            k: MultiModalKwargs._try_stack(item_list)
+            k: MultiModalKwargs._try_stack(item_list, pin_memory)
             for k, item_list in item_lists.items()
         }
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 175404efe..b1bc727e1 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -962,7 +962,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         encoder_outputs = []
         for grouped_mm_inputs in grouped_mm_inputs_list:
-            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
+            batched_mm_inputs = MultiModalKwargs.batch(
+                grouped_mm_inputs, pin_memory=self.pin_memory)
             batched_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_mm_inputs,
                 device=self.device,
@@ -1989,7 +1990,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             ).multi_modal_data
 
             batched_dummy_mm_inputs = MultiModalKwargs.batch(
-                [dummy_mm_kwargs] * max_num_mm_items)
+                [dummy_mm_kwargs] * max_num_mm_items,
+                pin_memory=self.pin_memory)
             batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_dummy_mm_inputs,
                 device=self.device,
-- 
GitLab


From a3f66e75d150761fbde323588c9a3484624898e5 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 10 Jun 2025 02:06:36 -0400
Subject: [PATCH 266/274] Add security warning to bug report template (#19365)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .github/ISSUE_TEMPLATE/400-bug-report.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml
index f05be2ba8..8c5c28cd7 100644
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -8,6 +8,16 @@ body:
   attributes:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: markdown
+  attributes:
+    value: |
+      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
+      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
+      - Passwords or authentication credentials
+      - Private URLs or endpoints
+      - Personal or confidential data
+      
+      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
   attributes:
     label: Your current environment
-- 
GitLab


From 6b1391ca7ece0350ecbe07e91f2c91d901c7f525 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Tue, 10 Jun 2025 14:12:42 +0800
Subject: [PATCH 267/274] [Misc] refactor neuron_multimodal and profiling
 (#19397)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 examples/offline_inference/neuron_multimodal.py       | 6 +++++-
 examples/offline_inference/profiling_tpu/profiling.py | 8 ++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py
index 6ff8faabd..26f7505f2 100644
--- a/examples/offline_inference/neuron_multimodal.py
+++ b/examples/offline_inference/neuron_multimodal.py
@@ -64,7 +64,7 @@ def print_outputs(outputs):
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
 
-if __name__ == "__main__":
+def main():
     assert (
         len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
     ), f"""Text, image prompts and sampling parameters should have the 
@@ -104,3 +104,7 @@ if __name__ == "__main__":
     # test batch-size = 4
     outputs = llm.generate(batched_inputs, batched_sample_params)
     print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py
index 5200be826..dfcbd8c8d 100644
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -70,7 +70,7 @@ def main(args: argparse.Namespace):
     return
 
 
-if __name__ == "__main__":
+def parse_args():
     parser = FlexibleArgumentParser(
         description="Benchmark the latency of processing a single batch of "
         "requests till completion."
@@ -102,5 +102,9 @@ if __name__ == "__main__":
     )
 
     parser = EngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
     main(args)
-- 
GitLab


From 32b3946bb4fb8c338ae6d1c04b7b1ef50896db55 Mon Sep 17 00:00:00 2001
From: Anna Pendleton <pendleton@google.com>
Date: Mon, 9 Jun 2025 23:16:09 -0700
Subject: [PATCH 268/274] Add clear documentation around the impact of
 debugging flag (#19369)

Signed-off-by: Anna Pendleton <pendleton@google.com>
---
 docs/usage/troubleshooting.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index 889cfccda..e9ab425a1 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -40,7 +40,7 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
 - `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging.
 - `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
 - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
-- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs.
+- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.
 
 ## Incorrect network setup
 
-- 
GitLab


From 9368cc90b2d2aaa359398a1b97c2313fb39c8de7 Mon Sep 17 00:00:00 2001
From: Louie Tsai <louie.tsai@intel.com>
Date: Mon, 9 Jun 2025 23:22:05 -0700
Subject: [PATCH 269/274] Automatically bind CPU OMP Threads of a rank to CPU
 ids of a NUMA node. (#17930)

Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
---
 docs/getting_started/installation/cpu.md | 21 +++++++--
 requirements/cpu.txt                     |  2 +
 vllm/envs.py                             |  8 +++-
 vllm/platforms/cpu.py                    |  3 ++
 vllm/v1/worker/cpu_worker.py             | 55 +++++++++++++++++++++++-
 vllm/worker/cpu_worker.py                | 53 ++++++++++++++++++++++-
 6 files changed, 134 insertions(+), 8 deletions(-)

diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 18c96b264..00bb5cae4 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -110,8 +110,9 @@ vLLM CPU backend supports the following vLLM features:
 
 ## Related runtime environment variables
 
-- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node. By setting to `all`, the OpenMP threads of each rank uses all CPU cores available on the system. Default value is `auto`.
+- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `0`.
 - `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
 
 ## Performance tips
@@ -133,7 +134,15 @@ export VLLM_CPU_OMP_THREADS_BIND=0-29
 vllm serve facebook/opt-125m
 ```
 
-- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+ or using default auto thread binding:
+
+```console
+export VLLM_CPU_KVCACHE_SPACE=40
+export VLLM_CPU_NUM_OF_RESERVED_CPU=2
+vllm serve facebook/opt-125m
+```
+
+- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
 
 ```console
 $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
@@ -178,6 +187,12 @@ $ python examples/offline_inference/basic/basic.py
     VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
     ```
 
+    or using default auto thread binding:
+
+    ```console
+    VLLM_CPU_KVCACHE_SPACE=40 vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+    ```
+
   - For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to a same NUMA node.
 
   - Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, TP worker will be killed due to out-of-memory.
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index e43b44397..d7b0fc6d8 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -27,3 +27,5 @@ triton==3.2.0; platform_machine == "x86_64"
 # Intel Extension for PyTorch, only for x86_64 CPUs
 intel-openmp==2024.2.1; platform_machine == "x86_64"
 intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
+py-libnuma; platform_system != "Darwin"
+psutil; platform_system != "Darwin"
diff --git a/vllm/envs.py b/vllm/envs.py
index 6f876d3df..80c5f289b 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -44,6 +44,7 @@ if TYPE_CHECKING:
     VLLM_PP_LAYER_PARTITION: Optional[str] = None
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_CPU_OMP_THREADS_BIND: str = ""
+    VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0
     VLLM_CPU_MOE_PREPACK: bool = True
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_XLA_CHECK_RECOMPILATION: bool = False
@@ -422,7 +423,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
     # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
     "VLLM_CPU_OMP_THREADS_BIND":
-    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),
+    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
+
+    # (CPU backend only) CPU cores not used by OMP threads .
+    # Those CPU cores will not be used by OMP threads of a rank.
+    "VLLM_CPU_NUM_OF_RESERVED_CPU":
+    lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")),
 
     # (CPU backend only) whether to use prepack for MoE layer. This will be
     # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 71c964fbf..27c591e3b 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -208,6 +208,9 @@ class CpuPlatform(Platform):
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
+        # Share the cpusets list among ranks by spawning process instead
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
         # Intel OpenMP setting
         ld_prealod_str = os.getenv("LD_PRELOAD", "")
         if "libiomp5.so" in ld_prealod_str:
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index 0b710b7bc..9a35e8812 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import os
+from importlib import util
 from typing import Optional
 
 import torch
@@ -38,10 +39,14 @@ class CPUWorker(Worker):
     def init_device(self):
         # Setup OpenMP threads affinity.
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
-        if omp_cpuids == "all":
-            self.local_omp_cpuid = "all"
+        self.local_omp_cpuid = "all"
+        if omp_cpuids == "auto":
+            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
+            )
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[self.rank]
+
+        if self.local_omp_cpuid != "all":
             ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
             if ret:
                 logger.info(ret)
@@ -99,3 +104,49 @@ class CPUWorker(Worker):
 
         assert isinstance(output, ModelRunnerOutput)
         return output if self.is_driver_worker else None
+
+    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
+        """Return CPUs id binding based on NUMA nodes.
+        """
+        rank_to_cpus = self.local_omp_cpuid
+        # Setup OpenMP thread affinity based on NUMA nodes automatically
+        world_size = self.vllm_config.parallel_config.world_size
+        libnuma_found = util.find_spec("numa") is not None
+        psutil_found = util.find_spec("psutil") is not None
+        if libnuma_found and psutil_found:
+            import psutil
+            from numa import info
+            cpu_count = psutil.cpu_count(logical=False)
+            cpus_allow_list = psutil.Process().cpu_affinity()
+            numa_size = info.get_num_configured_nodes()
+            cpu_count_per_numa = cpu_count // numa_size
+            num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
+                                      cpu_count_per_numa // 2)
+
+            # check allow node_to_cpus list
+            node_to_cpus = []
+            for i in range(numa_size):
+                node_intersect = set(
+                    info.node_to_cpus(i)).intersection(cpus_allow_list)
+                if bool(node_intersect):
+                    node_to_cpus.append(list(node_intersect))
+
+            if world_size > len(node_to_cpus):
+                logger.error(
+                    "Auto thread-binding failed due to "
+                    "world size: %d is larger than "
+                    "allowed NUMA nodes number: %d."
+                    "Please try to bind threads manually.", world_size,
+                    len(node_to_cpus))
+            else:
+                end = cpu_count_per_numa - num_of_reserved_cpu
+                rank_to_cpus_list = node_to_cpus[self.rank][:end]
+                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
+                logger.info("auto thread-binding list: %s", rank_to_cpus)
+        else:
+            logger.warning(
+                "Auto thread-binding is not supported due to "
+                "the lack of package numa and psutil,"
+                "fallback to no thread-binding. To get better performance,"
+                "please try to manually bind threads.")
+        return rank_to_cpus
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index b04a9a1eb..9e834befd 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A CPU worker class."""
 import os
+from importlib import util
 from typing import Dict, List, Optional, Set, Tuple, Type
 
 import torch
@@ -156,8 +157,10 @@ class CPUWorker(LocalOrDistributedWorkerBase):
 
         # Setup OpenMP threads affinity.
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
-        if omp_cpuids == "all":
-            self.local_omp_cpuid = "all"
+        self.local_omp_cpuid = "all"
+        if omp_cpuids == "auto":
+            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
+            )
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
@@ -399,3 +402,49 @@ class CPUWorker(LocalOrDistributedWorkerBase):
         return CPUCacheEngine.get_cache_block_size(
             self.cache_config.block_size, self.cache_config.cache_dtype,
             self.model_config, self.parallel_config)
+
+    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
+        """Return CPUs id binding based on NUMA nodes.
+        """
+        rank_to_cpus = self.local_omp_cpuid
+        # Setup OpenMP thread affinity based on NUMA nodes automatically
+        world_size = self.vllm_config.parallel_config.world_size
+        libnuma_found = util.find_spec("numa") is not None
+        psutil_found = util.find_spec("psutil") is not None
+        if libnuma_found and psutil_found:
+            import psutil
+            from numa import info
+            cpu_count = psutil.cpu_count(logical=False)
+            cpus_allow_list = psutil.Process().cpu_affinity()
+            numa_size = info.get_num_configured_nodes()
+            cpu_count_per_numa = cpu_count // numa_size
+            num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
+                                      cpu_count_per_numa // 2)
+
+            # check allow node_to_cpus list
+            node_to_cpus = []
+            for i in range(numa_size):
+                node_intersect = set(
+                    info.node_to_cpus(i)).intersection(cpus_allow_list)
+                if bool(node_intersect):
+                    node_to_cpus.append(list(node_intersect))
+
+            if world_size > len(node_to_cpus):
+                logger.error(
+                    "Auto thread-binding failed due to "
+                    "world size: %d is larger than "
+                    "allowed NUMA nodes number: %d."
+                    "Please try to bind threads manually.", world_size,
+                    len(node_to_cpus))
+            else:
+                end = cpu_count_per_numa - num_of_reserved_cpu
+                rank_to_cpus_list = node_to_cpus[self.rank][:end]
+                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
+                logger.info("auto thread-binding list: %s", rank_to_cpus)
+        else:
+            logger.warning(
+                "Auto thread-binding is not supported due to "
+                "the lack of package numa and psutil,"
+                "fallback to no thread-binding. To get better performance,"
+                "please try to manually bind threads.")
+        return rank_to_cpus
-- 
GitLab


From 5f1ac1e1d14ce239ea6ea99e276ad90cdc342132 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Tue, 10 Jun 2025 16:30:20 +0800
Subject: [PATCH 270/274] Revert "[v1] Add fp32 support to v1 engine through
 flex attn" (#19404)

---
 .../attention/test_attention_selector.py      | 28 -------------------
 tests/v1/worker/test_gpu_model_runner.py      |  6 ----
 vllm/engine/arg_utils.py                      |  7 +++++
 vllm/platforms/cuda.py                        |  4 ---
 4 files changed, 7 insertions(+), 38 deletions(-)

diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index ea5c8013a..f3e641557 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -183,34 +183,6 @@ def test_env(
                     assert backend.get_name() == expected
 
 
-@pytest.mark.parametrize("device", ["cpu", "cuda"])
-@pytest.mark.parametrize("use_v1", [True, False])
-def test_fp32_fallback(
-    device: str,
-    use_v1: bool,
-    monkeypatch: pytest.MonkeyPatch,
-):
-    """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-
-        if device == "cpu":
-            with patch("vllm.attention.selector.current_platform",
-                       CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
-            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
-                    if use_v1 else "TORCH_SDPA")
-
-        elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform",
-                       CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
-            assert (backend.get_name() == "FLEX_ATTENTION"
-                    if use_v1 else "XFORMERS")
-
-
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index fa0bab71b..994432dfd 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -4,7 +4,6 @@
 import random
 
 import pytest
-import torch
 
 from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
@@ -400,7 +399,6 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
 
 
 def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
@@ -429,7 +427,6 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
 
 
 def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     invalid_layer = "model.layers.0.cross_attn.attn"
@@ -458,7 +455,6 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
 
 
 def test_init_kv_cache_with_kv_sharing_target_same_as_current():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
@@ -487,7 +483,6 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
 
 
 def test_init_kv_cache_without_kv_sharing():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
@@ -555,7 +550,6 @@ def test_init_kv_cache_without_kv_sharing():
 
 
 def test_init_kv_cache_with_kv_sharing_valid():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 837c9b1b1..4ce1b41e4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1337,6 +1337,13 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False
 
+        # Only Fp16 and Bf16 dtypes since we only support FA.
+        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
+        if model_config.dtype not in V1_SUPPORTED_DTYPES:
+            _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
+                               recommend_to_remove=False)
+            return False
+
         # No Embedding Models so far.
         if model_config.task not in ["generate"]:
             _raise_or_fallback(feature_name=f"--task {model_config.task}",
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 20c06c5c9..48d1aacba 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -233,10 +233,6 @@ class CudaPlatformBase(Platform):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
                         "triton_attn.TritonAttentionBackend")
-            if dtype not in (torch.float16, torch.bfloat16):
-                logger.info_once(
-                    f"Using FlexAttenion backend for {dtype} on V1 engine.")
-                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
             if cls.is_device_capability(100):
                 # Prefer FlashInfer for V1 on Blackwell GPUs if installed
                 try:
-- 
GitLab


From 467bef18a353e8ca133a0a5611d6f4384ea351b6 Mon Sep 17 00:00:00 2001
From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com>
Date: Tue, 10 Jun 2025 01:48:51 -0700
Subject: [PATCH 271/274] [BugFix][FlashInfer] Fix attention backend interface
 mismatch with unexpected keyword `use_irope` (#19134)

Signed-off-by: Yunqiu Guo <guorachel@meta.com>
---
 vllm/v1/attention/backends/flashinfer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index f1b61c152..b15bb4b31 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -508,7 +508,12 @@ class FlashInferImpl(AttentionImpl):
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in FlashInfer is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
-- 
GitLab


From e4248849ec1dea54851a1d1769c86e1504f183ea Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Tue, 10 Jun 2025 20:02:40 +0800
Subject: [PATCH 272/274] [BugFix][CPU] Fix CPU CI by ignore collecting
 test_pixtral (#19411)

Signed-off-by: jiang.li <jiang1.li@intel.com>
---
 .buildkite/scripts/hardware_ci/run-cpu-test.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 61aa7df13..bbcde4009 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -43,7 +43,10 @@ function cpu_tests() {
     pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/language/generation -m cpu_model
     pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
+    pytest -v -s tests/models/multimodal/generation \
+                --ignore=tests/models/multimodal/generation/test_mllama.py \
+                --ignore=tests/models/multimodal/generation/test_pixtral.py \
+                -m cpu_model"
 
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
-- 
GitLab


From 64a9af5afa976a4cfdbbd5b63ff0ea88953f4203 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 10 Jun 2025 20:06:08 +0800
Subject: [PATCH 273/274] Simplify ep kernels installation (#19412)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tools/ep_kernels/README.md                   | 10 +++---
 tools/ep_kernels/configure_system_drivers.sh |  7 ++++
 tools/ep_kernels/install_python_libraries.sh | 36 ++++++++------------
 tools/ep_kernels/install_system_drivers.sh   | 24 -------------
 tools/ep_kernels/install_system_libraries.sh | 18 ----------
 5 files changed, 26 insertions(+), 69 deletions(-)
 create mode 100644 tools/ep_kernels/configure_system_drivers.sh
 delete mode 100644 tools/ep_kernels/install_system_drivers.sh
 delete mode 100644 tools/ep_kernels/install_system_libraries.sh

diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md
index 5c98e999d..f1479146f 100644
--- a/tools/ep_kernels/README.md
+++ b/tools/ep_kernels/README.md
@@ -1,11 +1,10 @@
 Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
 
-Here we break down the requirements in 3 steps:
+Here we break down the requirements in 2 steps:
 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
-2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that they can be shipped as a single image.
-3. Build and install the system drivers (GDR Copy, and necessary modifications to NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine.
+2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
 
-2 and 3 are necessary for multi-node deployment.
+2 is necessary for multi-node deployment.
 
 All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
 
@@ -21,7 +20,6 @@ bash install_python_libraries.sh
 
 ```bash
 bash install_python_libraries.sh
-sudo bash install_system_libraries.sh
-sudo bash install_system_drivers.sh
+sudo bash configure_system_drivers.sh
 sudo reboot # Reboot is required to load the new driver
 ```
diff --git a/tools/ep_kernels/configure_system_drivers.sh b/tools/ep_kernels/configure_system_drivers.sh
new file mode 100644
index 000000000..cf15c1dac
--- /dev/null
+++ b/tools/ep_kernels/configure_system_drivers.sh
@@ -0,0 +1,7 @@
+set -ex
+
+# turn on IBGDA
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
+update-initramfs -u
+
+echo "Please reboot the system to apply the changes"
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index e5632f4b5..83643c084 100644
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -13,16 +13,6 @@ fi
 # install dependencies if not installed
 pip3 install cmake torch ninja
 
-# build gdrcopy, required by nvshmem
-pushd $WORKSPACE
-wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz
-mkdir -p gdrcopy_src
-tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1
-pushd gdrcopy_src
-make -j$(nproc)
-make prefix=$WORKSPACE/gdrcopy_install install
-popd
-
 # build nvshmem
 pushd $WORKSPACE
 mkdir -p nvshmem_src
@@ -34,26 +24,30 @@ git init
 git apply -vvv nvshmem.patch
 
 # assume CUDA_HOME is set correctly
-export GDRCOPY_HOME=$WORKSPACE/gdrcopy_install
+if [ -z "$CUDA_HOME" ]; then
+    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
+    exit 1
+fi
+
+# disable all features except IBGDA
+export NVSHMEM_IBGDA_SUPPORT=1
+
 export NVSHMEM_SHMEM_SUPPORT=0
 export NVSHMEM_UCX_SUPPORT=0
 export NVSHMEM_USE_NCCL=0
-export NVSHMEM_IBGDA_SUPPORT=1
 export NVSHMEM_PMIX_SUPPORT=0
 export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
-export NVSHMEM_USE_GDRCOPY=1
-export NVSHMEM_IBRC_SUPPORT=1
-
-# remove MPI dependency
+export NVSHMEM_USE_GDRCOPY=0
+export NVSHMEM_IBRC_SUPPORT=0
 export NVSHMEM_BUILD_TESTS=0
 export NVSHMEM_BUILD_EXAMPLES=0
 export NVSHMEM_MPI_SUPPORT=0
+export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
+export NVSHMEM_BUILD_TXZ_PACKAGE=0
+export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
 
-cmake -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
-
-cd $WORKSPACE/nvshmem_build/
-make -j$(nproc)
-make install
+cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
+cmake --build $WORKSPACE/nvshmem_build/ --target install
 
 popd
 
diff --git a/tools/ep_kernels/install_system_drivers.sh b/tools/ep_kernels/install_system_drivers.sh
deleted file mode 100644
index 8b0669ef4..000000000
--- a/tools/ep_kernels/install_system_drivers.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-set -ex
-
-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-    export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
-
-if [ ! -d "$WORKSPACE" ]; then
-    mkdir -p $WORKSPACE
-fi
-
-# build and install gdrcopy driver
-pushd $WORKSPACE
-cd gdrcopy_src
-./insmod.sh
-# run gdrcopy_copybw to test the installation
-$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw
-
-# turn on IBGDA
-echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
-update-initramfs -u
-
-echo "Please reboot the system to apply the changes"
diff --git a/tools/ep_kernels/install_system_libraries.sh b/tools/ep_kernels/install_system_libraries.sh
deleted file mode 100644
index c148d5443..000000000
--- a/tools/ep_kernels/install_system_libraries.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-set -ex
-
-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-    export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
-
-if [ ! -d "$WORKSPACE" ]; then
-    mkdir -p $WORKSPACE
-fi
-
-# build and install gdrcopy system packages
-pushd $WORKSPACE
-cd gdrcopy_src/packages
-apt install devscripts -y
-CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
-dpkg -i *.deb
-- 
GitLab


From b6553be1bc75f046b00046a4ad7576364d03c835 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 10 Jun 2025 21:51:49 +0800
Subject: [PATCH 274/274] [Misc] Slight improvement of the BNB  (#19418)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/model_executor/layers/quantization/bitsandbytes.py | 4 +---
 vllm/model_executor/model_loader/bitsandbytes_loader.py | 3 ++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 38935bc96..53ba84ea8 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -71,9 +71,7 @@ class BitsAndBytesConfig(QuantizationConfig):
 
     @staticmethod
     def get_config_filenames() -> list[str]:
-        return [
-            "adapter_config.json",
-        ]
+        return []
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "BitsAndBytesConfig":
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index ebbb021ca..068a4e355 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -392,7 +392,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
     def _get_bnb_target_modules(self, model: nn.Module) -> None:
 
         for name, module in model.named_modules():
-            if isinstance(module, (LinearBase, )):
+            if (isinstance(module, LinearBase) and
+                    hasattr(module.quant_method, "quant_config")):
                 if modules_info := self.modules_mapping.get_sub_modules(name):
                     # Map vllm's names to transformers's names.
                     rep_name, sub_modules = modules_info
-- 
GitLab