From 3194039c0ee82685af434c9f8023304b4a45124b Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Sat, 1 Feb 2025 11:16:19 -0500
Subject: [PATCH 01/65] Apply torch.compile to fused_moe/grouped_topk (#12637)

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 1 +
 vllm/model_executor/models/deepseek_v3.py         | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index c966be99e..c80e6bf07 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -759,6 +759,7 @@ def fused_topk(
 
 
 # This is used by the Deepseek-V2 and Deepseek-V3 model
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def grouped_topk(hidden_states: torch.Tensor,
                  gating_output: torch.Tensor,
                  topk: int,
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index f6ab53c85..06ea3dab9 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -27,6 +27,7 @@ from torch import nn
 from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (get_pp_group,
                               get_tensor_model_parallel_world_size,
@@ -566,8 +567,7 @@ class DeepseekV3DecoderLayer(nn.Module):
         return hidden_states, residual
 
 
-# TODO(simon): check whether we support torch compile for Deepseek V3
-# @support_torch_compile
+@support_torch_compile
 class DeepseekV3Model(nn.Module):
 
     fall_back_to_pt_during_load = False
-- 
GitLab


From b4e5c03306ebdf58bce503c73b4f3dc5592df114 Mon Sep 17 00:00:00 2001
From: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Date: Sat, 1 Feb 2025 18:17:29 +0100
Subject: [PATCH 02/65] doc: fixing minor typo in readme.md (#12643)

Word "evolved" was mistyped

Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>

---------

Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5fd30f2b1..80c3ba7d1 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
 
-Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
 
 vLLM is fast with:
 
-- 
GitLab


From baaa2b24da86d63965dbffc34c97c7c4b50288db Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <linjinzhen@hotmail.com>
Date: Sun, 2 Feb 2025 15:29:56 +0800
Subject: [PATCH 03/65] [Bugfix] fix moe_wna16 get_quant_method (#12648)

Fix https://github.com/vllm-project/vllm/issues/12647
The `get_quant_method` of `moe_wna16` always return moe method,
GPTQ-based linear method or AWQ-based linear method, even when the
target module is attention layer.


https://github.com/vllm-project/vllm/blob/baeded25699f9f4851843306f27f685c4d4ee7c5/vllm/attention/layer.py#L86-L92

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
---
 .../layers/quantization/moe_wna16.py          | 27 +++++++++----------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 8cd9c0a7e..11a9d4ac5 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -6,16 +6,13 @@ from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization.awq import (AWQConfig,
-                                                         AWQLinearMethod)
-from vllm.model_executor.layers.quantization.awq_marlin import (
-    AWQMarlinConfig, AWQMarlinLinearMethod)
+from vllm.model_executor.layers.quantization.awq import AWQConfig
+from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.quantization.gptq import (GPTQConfig,
-                                                          GPTQLinearMethod)
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQMarlinConfig, GPTQMarlinLinearMethod)
+    GPTQMarlinConfig)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 
@@ -131,18 +128,18 @@ class MoeWNA16Config(QuantizationConfig):
         else:
             if self.linear_quant_method == "gptq":
                 if self.use_marlin:
-                    return GPTQMarlinLinearMethod(
-                        GPTQMarlinConfig.from_config(self.full_config))
+                    return GPTQMarlinConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
                 else:
-                    return GPTQLinearMethod(
-                        GPTQConfig.from_config(self.full_config))
+                    return GPTQConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
             elif self.linear_quant_method == "awq":
                 if self.use_marlin:
-                    return AWQMarlinLinearMethod(
-                        AWQMarlinConfig.from_config(self.full_config))
+                    return AWQMarlinConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
                 else:
-                    return AWQLinearMethod(
-                        AWQConfig.from_config(self.full_config))
+                    return AWQConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
             else:
                 raise ValueError("moe_wna16 only support gptq and awq.")
 
-- 
GitLab


From e497f33491671abbf94a3e563d55ca2818ee09db Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Sun, 2 Feb 2025 02:35:50 -0500
Subject: [PATCH 04/65] [Core] Silence unnecessary deprecation warnings
 (#12620)

I noticed during testing that I was getting a lot of these deprecation
warnings about `local_lora_path`:

```
DeprecationWarning: The 'lora_local_path' attribute is deprecated
     and will be removed in a future version.
     Please use 'lora_path' instead.
```

The check used for emitting this warning was always True, even when the
parameter was not actually specified. It will always be in
`__struct_fields__`. We should be checking for a non-None value,
instead.

Signed-off-by: Russell Bryant <rbryant@redhat.com>

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 vllm/lora/request.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index c4b26dc92..5e3d2f0ed 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -31,7 +31,7 @@ class LoRARequest(
     base_model_name: Optional[str] = msgspec.field(default=None)
 
     def __post_init__(self):
-        if 'lora_local_path' in self.__struct_fields__:
+        if self.lora_local_path:
             warnings.warn(
                 "The 'lora_local_path' attribute is deprecated "
                 "and will be removed in a future version. "
-- 
GitLab


From abfcdcdf27eb54d2a2104b4bf5091a24ea4ff928 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sat, 1 Feb 2025 23:43:20 -0800
Subject: [PATCH 05/65] [V1][Minor] Avoid frequently creating ConstantList
 (#12653)

A small optimization to avoid creating a new `ConstantList` every time `request.kv_block_hashes` is used.

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/request.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 2cfcd8b63..80160c673 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -64,6 +64,7 @@ class Request:
         # Cache the computed kv block hashes of the request to avoid
         # recomputing.
         self._kv_block_hashes: List[BlockHashType] = []
+        self.kv_block_hashes = ConstantList(self._kv_block_hashes)
 
         # Read-only views
         # Prevent directly appending to the these lists since
@@ -121,13 +122,9 @@ class Request:
         num_tokens = self.mm_positions[input_id]["length"]
         return num_tokens
 
-    @property
-    def kv_block_hashes(self) -> ConstantList["BlockHashType"]:
-        # Prevent directly appending to the kv_block_hashes.
-        return ConstantList(self._kv_block_hashes)
-
     def set_kv_block_hashes(self, value: List["BlockHashType"]) -> None:
         self._kv_block_hashes = value
+        self.kv_block_hashes = ConstantList(self._kv_block_hashes)
 
     def append_kv_block_hashes(self, block_hash: "BlockHashType") -> None:
         self._kv_block_hashes.append(block_hash)
-- 
GitLab


From f8ece6e17fbf4ff3a98d6d53cb3a03c50c02828c Mon Sep 17 00:00:00 2001
From: Shawn Du <shawnd200@outlook.com>
Date: Sun, 2 Feb 2025 16:40:58 +0800
Subject: [PATCH 06/65] [Core][v1] Unify allocating slots in prefill and decode
 in KV cache manager (#12608)

As mentioned in RFC https://github.com/vllm-project/vllm/issues/12254,
this PR achieves the task: combine allocate_slots and append_slots.

There should be no functionality change, except that in decode, also
raise exception when num_tokens is zero (like prefill), and change the
unit test case accordingly.

@comaniac @rickyyx @WoosukKwon @youkaichao @heheda12345 @simon-mo

---------

Signed-off-by: Shawn Du <shawnd200@outlook.com>
---
 tests/v1/core/test_prefix_caching.py |  24 ++--
 vllm/v1/core/kv_cache_manager.py     | 168 ++++++++++-----------------
 vllm/v1/core/scheduler.py            |   2 +-
 3 files changed, 78 insertions(+), 116 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index f434fa8c6..5c1cda285 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -164,7 +164,7 @@ def test_decode():
     req0.num_computed_tokens = 55
     for _ in range(4):
         req0.append_output_token_ids(8)
-    new_blocks = manager.append_slots(req0, 4)
+    new_blocks = manager.allocate_slots(req0, 4)
     assert new_blocks is not None and len(new_blocks) == 0
     assert manager.req_to_blocks[req0.request_id][-2].block_hash is None
 
@@ -175,7 +175,7 @@ def test_decode():
     # the preallocated block.
     for _ in range(5 + 10):
         req0.append_output_token_ids(7)
-    new_blocks = manager.append_slots(req0, 15)
+    new_blocks = manager.allocate_slots(req0, 15)
     assert new_blocks is not None and len(new_blocks) == 0
     assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None
 
@@ -185,7 +185,7 @@ def test_decode():
     # the preallocated block.
     for _ in range(6 + 11):
         req0.append_output_token_ids(12)
-    new_blocks = manager.append_slots(req0, 17)
+    new_blocks = manager.allocate_slots(req0, 17)
     # Plus one preallocated block.
     assert new_blocks is not None and len(new_blocks) == 2
 
@@ -395,12 +395,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
     req.num_computed_tokens = block_size
     assert len(blocks) == 1 + num_preallocated_blocks
 
-    # Assume all computed.
-    manager.append_slots(req, block_size * (len(blocks) - 1))
-    req.num_computed_tokens = block_size * len(blocks)
+    # Assume all computed, only when num_preallocate_tokens > 0, we need to
+    # consume the previously preallocated blocks.
+    if num_preallocated_blocks > 0:
+        manager.allocate_slots(req, block_size * (len(blocks) - 1))
+        req.num_computed_tokens = block_size * len(blocks)
 
     # Append 1 block.
-    blocks = manager.append_slots(req, block_size)
+    blocks = manager.allocate_slots(req, block_size)
     assert len(blocks) == 1 + num_preallocated_blocks
 
 
@@ -503,7 +505,7 @@ def test_mm_prefix_caching():
     # Append slots without allocating a new block.
     for _ in range(5):
         req0.append_output_token_ids(8)
-    new_blocks = manager.append_slots(req0, 5)
+    new_blocks = manager.allocate_slots(req0, 5)
     assert new_blocks is not None and len(new_blocks) == 0
 
     # The just completed block should have hashes with extra keys.
@@ -603,7 +605,7 @@ def test_reset_prefix_cache():
     unique_token_ids = [3] * 7
     all_token_ids = full_block_token_ids + unique_token_ids
     req0 = make_request("0", all_token_ids)
-    blocks = manager.allocate_slots(req0, 55, [])
+    blocks = manager.allocate_slots(req0, 55)
     assert [b.block_id for b in blocks] == [0, 1, 2, 3]
 
     unique_token_ids = [4] * 7
@@ -639,7 +641,7 @@ def test_uncache_blocks():
     )
 
     req0 = make_request("0", list(range(30)))
-    blocks = manager.allocate_slots(req0, 30, [])
+    blocks = manager.allocate_slots(req0, 30)
     assert [b.block_id for b in blocks] == [0, 1]
     assert len(manager.cached_block_hash_to_block) == 1
 
@@ -648,7 +650,7 @@ def test_uncache_blocks():
     # Simulate speculative tokens.
     for _ in range(5):
         req0.append_output_token_ids(8)
-    manager.append_slots(req0, 5)
+    manager.allocate_slots(req0, 5)
     assert len(manager.cached_block_hash_to_block) == 2
 
     # After sampling, assuming only 1 token is accepted.
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index d6c612f15..7176ec954 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple
 
 from vllm.logger import init_logger
 from vllm.utils import cdiv
@@ -67,7 +67,8 @@ class KVCacheManager:
         # Mapping from request ID to blocks to track the blocks allocated
         # for each request, so that we can free the blocks when the request
         # is finished.
-        self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
+        self.req_to_blocks: DefaultDict[str,
+                                        List[KVCacheBlock]] = defaultdict(list)
 
     @property
     def usage(self) -> float:
@@ -115,33 +116,75 @@ class KVCacheManager:
         num_computed_tokens = len(computed_blocks) * self.block_size
         return computed_blocks, num_computed_tokens
 
-    def append_slots(
+    def allocate_slots(
         self,
         request: Request,
         num_tokens: int,
+        new_computed_blocks: Optional[List[KVCacheBlock]] = None
     ) -> Optional[List[KVCacheBlock]]:
-        """Append slots to the block table of the request.
-        We first append slots to already allocated blocks. If the allocated
-        blocks are not enough, we allocate new blocks.
+        """Add slots for a request with new tokens to append.
 
         Args:
-            request: The request to append slots.
-            num_tokens: The number of tokens to append.
+            request: The request to allocate slots.
+            num_tokens: The number of tokens to allocate. Note that this does
+                not include the tokens that have already been computed.
+            new_computed_blocks: A list of new computed blocks just hitting the
+                prefix caching.
+
+        Blocks layout:
+        -----------------------------------------------------------------------
+        | < computed > | < new computed > |    < new >    | < pre-allocated > |
+        -----------------------------------------------------------------------
+        |                  < required >                   |
+        --------------------------------------------------
+        |                    < full >                  |
+        ------------------------------------------------
+                                          | <new full> |
+                                          --------------
+        The following *_blocks are illustrated in this layout.
 
         Returns:
-            A list of new blocks if new blocks are allocated, or None
-            if new blocks are required but cannot be allocated.
+            A list of new allocated blocks.
         """
-        num_required_blocks = cdiv(request.num_computed_tokens + num_tokens,
+        if num_tokens == 0:
+            raise ValueError("num_tokens must be greater than 0")
+
+        new_computed_blocks = new_computed_blocks or []
+
+        # The number of computed tokens is the number of computed tokens plus
+        # the new prefix caching hits
+        num_computed_tokens = (request.num_computed_tokens +
+                               len(new_computed_blocks) * self.block_size)
+        num_required_blocks = cdiv(num_computed_tokens + num_tokens,
                                    self.block_size)
         req_blocks = self.req_to_blocks[request.request_id]
+        num_new_blocks = (num_required_blocks - len(req_blocks) -
+                          len(new_computed_blocks))
 
-        num_new_blocks = num_required_blocks - len(req_blocks)
-        if num_new_blocks > self.free_block_queue.num_free_blocks:
-            # Need to allocate new blocks due to insufficient pre-allocated
-            # slots, but we cannot allocate new blocks due to the limit.
+        # If a computed block of a request is an eviction candidate (in the
+        # free queue and ref_cnt == 0), it cannot be counted as a free block
+        # when allocating this request.
+        num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks
+                                            if blk.ref_cnt == 0)
+        if (num_new_blocks > self.free_block_queue.num_free_blocks -
+                num_evictable_computed_blocks):
+            # Cannot allocate new blocks
             return None
 
+        # Touch the computed blocks to make sure they won't be evicted.
+        if self.enable_caching:
+            self._touch(new_computed_blocks)
+        else:
+            assert not new_computed_blocks, (
+                "Computed blocks should be empty when "
+                "prefix caching is disabled")
+
+        # Append the new computed blocks to the request blocks until now to
+        # avoid the case where the new blocks cannot be allocated.
+        req_blocks.extend(new_computed_blocks)
+
+        # Start to handle new blocks
+
         if num_new_blocks <= 0:
             # No new block is needed.
             new_blocks = []
@@ -160,112 +203,29 @@ class KVCacheManager:
             )
             assert num_new_blocks > 0
 
+            # Concatenate the computed block IDs and the new block IDs.
             new_blocks = self._get_new_blocks(num_new_blocks)
             req_blocks.extend(new_blocks)
 
         if not self.enable_caching:
             return new_blocks
 
-        num_computed_full_blocks = (request.num_computed_tokens //
-                                    self.block_size)
-
         # NOTE(rickyx): We are assuming the `num_tokens` are actual
         # tokens rather than lookahead slots (e.g. for speculative decoding).
         # TODO(rickyx): When supporting speculative decoding, we will need to
         # differentiate between them so that we can know how many blocks are
         # full after appending the actual tokens.
-        num_full_blocks_after_append = (request.num_computed_tokens +
-                                        num_tokens) // self.block_size
-        assert num_full_blocks_after_append <= len(req_blocks)
-
-        new_full_blocks = req_blocks[
-            num_computed_full_blocks:num_full_blocks_after_append]
-        if new_full_blocks:
-            self._cache_full_blocks(
-                request=request,
-                blk_start_idx=num_computed_full_blocks,
-                full_blocks=new_full_blocks,
-                prev_block=req_blocks[num_computed_full_blocks - 1]
-                if num_computed_full_blocks >= 1 else None,
-            )
-
-        return new_blocks
-
-    def allocate_slots(
-        self,
-        request: Request,
-        num_tokens: int,
-        computed_blocks: List[KVCacheBlock],
-    ) -> Optional[List[KVCacheBlock]]:
-        """Allocate slots for a new request.
-
-        Args:
-            request: The request to allocate slots.
-            num_tokens: The number of tokens to allocate. Note that this does
-                not include the tokens that have already been computed.
-            computed_blocks: A list of computed blocks.
-
-        Returns:
-            A list of new allocated blocks.
-        """
-        if num_tokens == 0:
-            raise ValueError(
-                f"num_tokens must be greater than 0, got {num_tokens}")
-
-        # If a computed block of a request is an eviction candidate (in the
-        # free queue and ref_cnt == 0), it cannot be counted as a free block
-        # when allocating this request.
-        num_evictable_computed_blocks = sum(1 for blk in computed_blocks
-                                            if blk.ref_cnt == 0)
-
-        num_required_blocks = cdiv(num_tokens, self.block_size)
-        if (num_required_blocks > self.free_block_queue.num_free_blocks -
-                num_evictable_computed_blocks):
-            # Cannot allocate new blocks.
-            return None
-
-        # Touch the computed blocks to make sure they won't be evicted.
-        if self.enable_caching:
-            self._touch(computed_blocks)
-        else:
-            assert not computed_blocks, (
-                "Computed blocks should be empty when "
-                "prefix caching is disabled")
-
-        # Determine the number of new blocks to allocate considering
-        # preallocated blocks.
-        num_new_blocks = min(
-            num_required_blocks + self.num_preallocate_blocks,
-            self.free_block_queue.num_free_blocks,
-            # Should not exceed the maximum number of blocks per request.
-            # This is especially because the block table has the shape
-            # [..., max_num_blocks_per_req].
-            # TODO(woosuk): Check and reject requests if
-            # num_prompt_tokens + max_tokens > max_model_len.
-            self.max_num_blocks_per_req - len(computed_blocks),
-        )
-        assert num_new_blocks > 0
-
-        # Concatenate the computed block IDs and the new block IDs.
-        new_blocks = self._get_new_blocks(num_new_blocks)
-        self.req_to_blocks[request.request_id] = computed_blocks + new_blocks
-
-        if not self.enable_caching:
-            return new_blocks
-
-        num_computed_tokens = len(computed_blocks) * self.block_size
         num_full_blocks = (num_computed_tokens + num_tokens) // self.block_size
-
-        new_full_blocks = self.req_to_blocks[
-            request.request_id][len(computed_blocks):num_full_blocks]
+        num_computed_full_blocks = num_computed_tokens // self.block_size
+        new_full_blocks = req_blocks[num_computed_full_blocks:num_full_blocks]
         if new_full_blocks:
             self._cache_full_blocks(
                 request=request,
-                blk_start_idx=len(computed_blocks),
+                blk_start_idx=num_computed_full_blocks,
                 # The new full blocks are the full blocks that are not computed.
                 full_blocks=new_full_blocks,
-                prev_block=computed_blocks[-1] if computed_blocks else None,
-            )
+                prev_block=(req_blocks[num_computed_full_blocks - 1]
+                            if num_computed_full_blocks > 0 else None))
 
         return new_blocks
 
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 910fc4ff4..27c9ac1ae 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -138,7 +138,7 @@ class Scheduler:
             assert num_new_tokens > 0
 
             while True:
-                new_blocks = self.kv_cache_manager.append_slots(
+                new_blocks = self.kv_cache_manager.allocate_slots(
                     request, num_new_tokens)
                 if new_blocks is None:
                     # The request cannot be scheduled.
-- 
GitLab


From f256ebe4df6757d76f1f1642d7e110268a2f8190 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Sun, 2 Feb 2025 18:17:26 +0800
Subject: [PATCH 07/65] [Hardware][Intel GPU] add XPU bf16 support (#12392)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 .../installation/gpu/xpu.inc.md               |  2 +-
 vllm/platforms/xpu.py                         | 23 ++++++++++++++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
index 411682678..ef02d9a07 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -36,7 +36,7 @@ VLLM_TARGET_DEVICE=xpu python setup.py install
 
 :::{note}
 - FP16 is the default data type in the current XPU backend. The BF16 data
-  type will be supported in the future.
+  type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
 :::
 
 ## Set up using Docker
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index a5ca77f57..039cdd5ad 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -66,9 +66,14 @@ class XPUPlatform(Platform):
         # check and update model config
         model_config = vllm_config.model_config
         if model_config.dtype == torch.bfloat16:
-            logger.warning(
-                "bfloat16 is not fully supported on XPU, casting to float16.")
-            model_config.dtype = torch.float16
+            bf16_supported = cls.device_support_bf16()
+            if not bf16_supported:
+                logger.warning(
+                    "bfloat16 is only supported on Intel Data Center GPU, "
+                    "Intel Arc GPU is not supported yet. Your device is %s,"
+                    "which is not supported. will fallback to float16",
+                    cls.get_device_name())
+                model_config.dtype = torch.float16
         if not model_config.enforce_eager:
             logger.warning(
                 "CUDA graph is not supported on XPU, fallback to the eager "
@@ -116,3 +121,15 @@ class XPUPlatform(Platform):
                                  ) -> float:
         torch.xpu.reset_peak_memory_stats(device)
         return torch.xpu.max_memory_allocated(device)
+
+    @classmethod
+    def device_support_bf16(cls) -> bool:
+        device_name = cls.get_device_name().lower()
+        if device_name.count("arc") > 0:
+            return False
+        elif device_name.count("data center gpu") > 0:
+            return True
+        else:
+            logger.warning("Unknown device name %s, always use float16",
+                           device_name)
+            return False
-- 
GitLab


From e489ad7a210f4234db696d1f2749d5f3662fa65b Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Sun, 2 Feb 2025 14:58:18 -0500
Subject: [PATCH 08/65] [Misc] Add SPDX-License-Identifier headers to python
 source files (#12628)

- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**

commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:18:24 2025 -0500

    Add SPDX license headers to python source files

This commit adds SPDX license headers to python source files as
recommended to
the project by the Linux Foundation. These headers provide a concise way
that is
both human and machine readable for communicating license information
for each
source file. It helps avoid any ambiguity about the license of the code
and can
    also be easily used by tools to help manage license compliance.

The Linux Foundation runs license scans against the codebase to help
ensure
    we are in compliance with the licenses of the code we use, including
dependencies. Having these headers in place helps that tool do its job.

    More information can be found on the SPDX site:

    - https://spdx.dev/learn/handling-license-info/

    Signed-off-by: Russell Bryant <rbryant@redhat.com>

commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:36:32 2025 -0500

    Check for SPDX headers using pre-commit

    Signed-off-by: Russell Bryant <rbryant@redhat.com>

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 .buildkite/check-wheel-size.py                |  2 +
 .buildkite/generate_index.py                  |  2 +
 .../test_lm_eval_correctness.py               |  1 +
 .../convert-results-json-to-markdown.py       |  2 +
 .../scripts/download-tokenizer.py             |  2 +
 .../scripts/generate-nightly-markdown.py      |  2 +
 .../scripts/get-lmdeploy-modelname.py         |  2 +
 .../scripts/summary-nightly-results.py        |  2 +
 .pre-commit-config.yaml                       |  6 ++-
 benchmarks/backend_request_func.py            |  2 +
 benchmarks/benchmark_guided.py                |  1 +
 benchmarks/benchmark_latency.py               |  1 +
 .../benchmark_long_document_qa_throughput.py  |  1 +
 benchmarks/benchmark_prefix_caching.py        |  1 +
 benchmarks/benchmark_prioritization.py        |  1 +
 benchmarks/benchmark_serving.py               |  1 +
 benchmarks/benchmark_serving_guided.py        |  1 +
 benchmarks/benchmark_throughput.py            |  1 +
 .../cutlass_benchmarks/sparse_benchmarks.py   |  2 +
 benchmarks/cutlass_benchmarks/utils.py        |  2 +
 .../cutlass_benchmarks/w8a8_benchmarks.py     |  2 +
 .../cutlass_benchmarks/weight_shapes.py       |  2 +
 .../disagg_prefill_proxy_server.py            |  2 +
 .../disagg_benchmarks/round_robin_proxy.py    |  2 +
 .../visualize_benchmark_results.py            |  2 +
 .../fused_kernels/layernorm_rms_benchmarks.py |  2 +
 benchmarks/kernels/benchmark_aqlm.py          |  2 +
 benchmarks/kernels/benchmark_layernorm.py     |  2 +
 benchmarks/kernels/benchmark_lora.py          |  2 +
 benchmarks/kernels/benchmark_machete.py       |  2 +
 benchmarks/kernels/benchmark_marlin.py        |  2 +
 benchmarks/kernels/benchmark_moe.py           |  2 +
 .../kernels/benchmark_paged_attention.py      |  2 +
 benchmarks/kernels/benchmark_quant.py         |  2 +
 benchmarks/kernels/benchmark_rmsnorm.py       |  2 +
 benchmarks/kernels/benchmark_rope.py          |  2 +
 benchmarks/kernels/benchmark_shapes.py        |  2 +
 benchmarks/kernels/graph_machete_bench.py     |  2 +
 benchmarks/kernels/utils.py                   |  2 +
 benchmarks/kernels/weight_shapes.py           |  2 +
 benchmarks/overheads/benchmark_hashing.py     |  2 +
 cmake/hipify.py                               |  2 +
 collect_env.py                                |  2 +
 .../vllm_cutlass_library_extension.py         |  2 +
 csrc/quantization/machete/generate.py         |  2 +
 docs/source/conf.py                           |  2 +
 docs/source/generate_examples.py              |  2 +
 examples/offline_inference/aqlm_example.py    |  2 +
 examples/offline_inference/arctic.py          |  2 +
 examples/offline_inference/audio_language.py  |  1 +
 examples/offline_inference/basic.py           |  2 +
 .../basic_with_model_default_sampling.py      |  2 +
 examples/offline_inference/chat.py            |  2 +
 examples/offline_inference/chat_with_tools.py |  2 +
 examples/offline_inference/classification.py  |  2 +
 examples/offline_inference/cli.py             |  2 +
 examples/offline_inference/cpu_offload.py     |  2 +
 examples/offline_inference/distributed.py     |  1 +
 examples/offline_inference/embedding.py       |  2 +
 examples/offline_inference/encoder_decoder.py |  1 +
 .../offline_inference/florence2_inference.py  |  1 +
 examples/offline_inference/gguf_inference.py  |  2 +
 .../offline_inference/llm_engine_example.py   |  2 +
 .../lora_with_quantization_inference.py       |  1 +
 examples/offline_inference/mlpspeculator.py   |  2 +
 .../offline_inference/multilora_inference.py  |  1 +
 examples/offline_inference/neuron.py          |  2 +
 .../neuron_int8_quantization.py               |  2 +
 examples/offline_inference/pixtral.py         |  2 +
 examples/offline_inference/prefix_caching.py  |  2 +
 examples/offline_inference/profiling.py       |  2 +
 .../profiling_tpu/profiling.py                |  2 +
 examples/offline_inference/rlhf.py            |  1 +
 .../offline_inference/save_sharded_state.py   |  1 +
 examples/offline_inference/scoring.py         |  2 +
 .../offline_inference/simple_profiling.py     |  2 +
 .../offline_inference/structured_outputs.py   |  2 +
 .../offline_inference/torchrun_example.py     |  1 +
 examples/offline_inference/tpu.py             |  2 +
 examples/offline_inference/vision_language.py |  1 +
 .../vision_language_embedding.py              |  1 +
 .../vision_language_multi_image.py            |  1 +
 examples/offline_inference/whisper.py         |  2 +
 examples/online_serving/api_client.py         |  1 +
 .../online_serving/cohere_rerank_client.py    |  1 +
 .../gradio_openai_chatbot_webserver.py        |  2 +
 examples/online_serving/gradio_webserver.py   |  2 +
 .../online_serving/jinaai_rerank_client.py    |  1 +
 .../openai_chat_completion_client.py          |  2 +
 ...i_chat_completion_client_for_multimodal.py |  1 +
 ...penai_chat_completion_client_with_tools.py |  1 +
 ...enai_chat_completion_structured_outputs.py |  2 +
 .../openai_chat_completion_with_reasoning.py  |  1 +
 ...hat_completion_with_reasoning_streaming.py |  1 +
 ...ai_chat_embedding_client_for_multimodal.py |  2 +
 .../openai_completion_client.py               |  2 +
 .../openai_cross_encoder_score.py             |  1 +
 .../online_serving/openai_embedding_client.py |  2 +
 .../online_serving/openai_pooling_client.py   |  1 +
 .../opentelemetry/dummy_client.py             |  2 +
 examples/other/tensorize_vllm_model.py        |  2 +
 find_cuda_init.py                             |  2 +
 python_only_dev.py                            |  2 +
 setup.py                                      |  2 +
 tests/async_engine/api_server_async_engine.py |  1 +
 tests/async_engine/test_api_server.py         |  2 +
 tests/async_engine/test_async_llm_engine.py   |  2 +
 tests/async_engine/test_request_tracker.py    |  2 +
 .../test_basic_correctness.py                 |  1 +
 .../basic_correctness/test_chunked_prefill.py |  1 +
 tests/basic_correctness/test_cpu_offload.py   |  2 +
 tests/basic_correctness/test_cumem.py         |  2 +
 tests/basic_correctness/test_preemption.py    |  1 +
 tests/compile/backend.py                      |  2 +
 tests/compile/piecewise/test_simple.py        |  1 +
 tests/compile/piecewise/test_toy_llama.py     |  1 +
 tests/compile/test_basic_correctness.py       |  2 +
 tests/compile/test_full_graph.py              |  2 +
 tests/compile/test_functionalization.py       |  2 +
 tests/compile/test_fusion.py                  |  2 +
 tests/compile/test_pass_manager.py            |  2 +
 tests/compile/test_wrapper.py                 |  2 +
 tests/compile/utils.py                        |  2 +
 tests/conftest.py                             |  2 +
 tests/core/block/conftest.py                  |  2 +
 tests/core/block/e2e/conftest.py              |  2 +
 tests/core/block/e2e/test_correctness.py      |  2 +
 .../e2e/test_correctness_sliding_window.py    |  2 +
 tests/core/block/test_block_manager.py        |  2 +
 tests/core/block/test_block_table.py          |  2 +
 tests/core/block/test_common.py               |  2 +
 .../block/test_cpu_gpu_block_allocator.py     |  2 +
 tests/core/block/test_naive_block.py          |  2 +
 tests/core/block/test_prefix_caching_block.py |  2 +
 tests/core/test_chunked_prefill_scheduler.py  |  2 +
 tests/core/test_num_computed_tokens_update.py |  2 +
 tests/core/test_scheduler.py                  |  2 +
 tests/core/test_scheduler_encoder_decoder.py  |  2 +
 tests/core/test_serialization.py              |  2 +
 tests/core/utils.py                           |  2 +
 tests/distributed/test_ca_buffer_sharing.py   |  2 +
 tests/distributed/test_comm_ops.py            |  1 +
 tests/distributed/test_custom_all_reduce.py   |  2 +
 tests/distributed/test_distributed_oot.py     |  2 +
 .../distributed/test_multi_node_assignment.py |  1 +
 tests/distributed/test_pipeline_parallel.py   |  1 +
 tests/distributed/test_pipeline_partition.py  |  2 +
 tests/distributed/test_pp_cudagraph.py        |  2 +
 tests/distributed/test_pynccl.py              |  2 +
 tests/distributed/test_same_node.py           |  2 +
 tests/distributed/test_shm_broadcast.py       |  2 +
 tests/distributed/test_torchrun_example.py    |  2 +
 tests/distributed/test_utils.py               |  2 +
 tests/encoder_decoder/test_e2e_correctness.py |  1 +
 .../output_processor/test_multi_step.py       |  2 +
 .../output_processor/test_stop_checker.py     |  2 +
 tests/engine/test_arg_utils.py                |  2 +
 tests/engine/test_computed_prefix_blocks.py   |  2 +
 tests/engine/test_custom_executor.py          |  2 +
 tests/engine/test_detokenization.py           |  2 +
 tests/engine/test_multiproc_workers.py        |  2 +
 tests/engine/test_short_mm_context.py         |  2 +
 tests/engine/test_skip_tokenizer_init.py      |  2 +
 tests/engine/test_stop_reason.py              |  1 +
 tests/engine/test_stop_strings.py             |  2 +
 tests/entrypoints/conftest.py                 |  2 +
 tests/entrypoints/llm/test_accuracy.py        |  1 +
 tests/entrypoints/llm/test_chat.py            |  2 +
 tests/entrypoints/llm/test_collective_rpc.py  |  2 +
 tests/entrypoints/llm/test_encode.py          |  2 +
 tests/entrypoints/llm/test_generate.py        |  2 +
 .../llm/test_generate_multiple_loras.py       |  2 +
 tests/entrypoints/llm/test_gpu_utilization.py |  2 +
 tests/entrypoints/llm/test_guided_generate.py |  2 +
 tests/entrypoints/llm/test_init.py            |  2 +
 tests/entrypoints/llm/test_lazy_outlines.py   |  2 +
 .../entrypoints/llm/test_prompt_validation.py |  2 +
 .../offline_mode/test_offline_mode.py         |  1 +
 .../test_deepseekr1_reasoning_parser.py       |  2 +
 .../openai/reasoning_parsers/utils.py         |  2 +
 tests/entrypoints/openai/test_accuracy.py     |  1 +
 .../openai/test_async_tokenization.py         |  2 +
 tests/entrypoints/openai/test_audio.py        |  2 +
 tests/entrypoints/openai/test_basic.py        |  2 +
 tests/entrypoints/openai/test_chat.py         |  2 +
 tests/entrypoints/openai/test_chat_echo.py    |  2 +
 .../entrypoints/openai/test_chat_template.py  |  2 +
 .../entrypoints/openai/test_chunked_prompt.py |  2 +
 tests/entrypoints/openai/test_cli_args.py     |  2 +
 tests/entrypoints/openai/test_completion.py   |  2 +
 tests/entrypoints/openai/test_embedding.py    |  2 +
 .../openai/test_encoder_decoder.py            |  2 +
 .../entrypoints/openai/test_lora_adapters.py  |  2 +
 tests/entrypoints/openai/test_metrics.py      |  2 +
 tests/entrypoints/openai/test_models.py       |  2 +
 .../openai/test_oot_registration.py           |  2 +
 tests/entrypoints/openai/test_pooling.py      |  2 +
 .../openai/test_prompt_validation.py          |  2 +
 tests/entrypoints/openai/test_rerank.py       |  2 +
 .../openai/test_return_tokens_as_ids.py       |  2 +
 tests/entrypoints/openai/test_root_path.py    |  2 +
 tests/entrypoints/openai/test_run_batch.py    |  2 +
 tests/entrypoints/openai/test_score.py        |  2 +
 tests/entrypoints/openai/test_serving_chat.py |  2 +
 .../entrypoints/openai/test_serving_models.py |  2 +
 tests/entrypoints/openai/test_shutdown.py     |  2 +
 tests/entrypoints/openai/test_tokenization.py |  2 +
 tests/entrypoints/openai/test_video.py        |  2 +
 tests/entrypoints/openai/test_vision.py       |  2 +
 .../openai/test_vision_embedding.py           |  2 +
 .../tool_parsers/test_pythonic_tool_parser.py |  2 +
 .../entrypoints/openai/tool_parsers/utils.py  |  2 +
 tests/entrypoints/test_chat_utils.py          |  2 +
 tests/kernels/allclose_default.py             |  2 +
 tests/kernels/conftest.py                     |  2 +
 tests/kernels/quant_utils.py                  |  2 +
 tests/kernels/test_activation.py              |  2 +
 tests/kernels/test_aqlm.py                    |  2 +
 tests/kernels/test_attention.py               |  2 +
 tests/kernels/test_attention_selector.py      |  2 +
 tests/kernels/test_awq.py                     |  2 +
 tests/kernels/test_awq_marlin.py              |  1 +
 tests/kernels/test_awq_triton.py              |  1 +
 tests/kernels/test_block_fp8.py               |  2 +
 tests/kernels/test_blocksparse_attention.py   |  2 +
 tests/kernels/test_cache.py                   |  2 +
 tests/kernels/test_cascade_flash_attn.py      |  2 +
 tests/kernels/test_causal_conv1d.py           |  2 +
 tests/kernels/test_cutlass.py                 |  1 +
 tests/kernels/test_cutlass_2of4_sparse.py     |  1 +
 tests/kernels/test_encoder_decoder_attn.py    |  1 +
 tests/kernels/test_flash_attn.py              |  2 +
 tests/kernels/test_flashinfer.py              |  2 +
 tests/kernels/test_fp8_quant.py               |  2 +
 tests/kernels/test_fused_quant_layernorm.py   |  2 +
 tests/kernels/test_ggml.py                    |  2 +
 tests/kernels/test_gguf.py                    |  2 +
 tests/kernels/test_gptq.py                    |  2 +
 tests/kernels/test_int8_quant.py              |  2 +
 tests/kernels/test_layernorm.py               |  2 +
 tests/kernels/test_machete_mm.py              |  1 +
 tests/kernels/test_mamba_ssm.py               |  2 +
 tests/kernels/test_marlin_gemm.py             |  1 +
 tests/kernels/test_mha_attn.py                |  1 +
 tests/kernels/test_moe.py                     |  1 +
 tests/kernels/test_permute_cols.py            |  2 +
 tests/kernels/test_pos_encoding.py            |  2 +
 tests/kernels/test_prefix_prefill.py          |  2 +
 tests/kernels/test_rotary_embedding.py        |  1 +
 tests/kernels/test_triton_decode_attention.py |  2 +
 tests/kernels/test_triton_scaled_mm.py        |  1 +
 tests/kernels/test_utils.py                   |  1 +
 tests/kernels/utils.py                        |  1 +
 tests/kv_transfer/disagg_test.py              |  2 +
 tests/kv_transfer/module_test.py              |  2 +
 tests/kv_transfer/test_lookup_buffer.py       |  2 +
 tests/kv_transfer/test_send_recv.py           |  2 +
 tests/lora/conftest.py                        |  2 +
 tests/lora/data/long_context_test_data.py     |  2 +
 tests/lora/test_baichuan.py                   |  2 +
 tests/lora/test_chatglm3_tp.py                |  2 +
 tests/lora/test_gemma.py                      |  2 +
 tests/lora/test_jamba.py                      |  2 +
 tests/lora/test_layers.py                     |  2 +
 tests/lora/test_llama_tp.py                   |  2 +
 tests/lora/test_long_context.py               |  2 +
 tests/lora/test_lora_bias_e2e.py              |  2 +
 tests/lora/test_lora_checkpoints.py           |  2 +
 tests/lora/test_lora_huggingface.py           |  2 +
 tests/lora/test_lora_manager.py               |  2 +
 tests/lora/test_minicpmv_tp.py                |  2 +
 tests/lora/test_mixtral.py                    |  2 +
 tests/lora/test_peft_helper.py                |  2 +
 tests/lora/test_phi.py                        |  2 +
 tests/lora/test_punica_ops_sizes.py           |  1 +
 tests/lora/test_punica_ops_variation.py       |  1 +
 tests/lora/test_quant_model.py                |  2 +
 tests/lora/test_qwen2vl.py                    |  2 +
 tests/lora/test_tokenizer_group.py            |  2 +
 tests/lora/test_utils.py                      |  2 +
 tests/lora/test_worker.py                     |  2 +
 tests/lora/utils.py                           |  2 +
 tests/metrics/test_metrics.py                 |  2 +
 tests/model_executor/conftest.py              |  2 +
 .../model_executor/test_enabled_custom_ops.py |  2 +
 .../model_executor/test_guided_processors.py  |  2 +
 .../test_model_load_with_params.py            |  2 +
 tests/model_executor/weight_utils.py          |  2 +
 .../audio_language/test_ultravox.py           |  2 +
 .../models/decoder_only/language/test_aqlm.py |  1 +
 .../models/decoder_only/language/test_fp8.py  |  2 +
 .../models/decoder_only/language/test_gguf.py |  1 +
 .../decoder_only/language/test_gptq_marlin.py |  1 +
 .../language/test_gptq_marlin_24.py           |  1 +
 .../decoder_only/language/test_granite.py     |  1 +
 .../decoder_only/language/test_jamba.py       |  2 +
 .../decoder_only/language/test_mamba.py       |  1 +
 .../decoder_only/language/test_mistral.py     |  1 +
 .../decoder_only/language/test_modelopt.py    |  2 +
 .../decoder_only/language/test_models.py      |  1 +
 .../decoder_only/language/test_phimoe.py      |  1 +
 .../decoder_only/vision_language/test_awq.py  |  2 +
 .../vision_language/test_h2ovl.py             |  2 +
 .../vision_language/test_intern_vit.py        |  2 +
 .../vision_language/test_models.py            |  1 +
 .../vision_language/test_phi3v.py             |  2 +
 .../vision_language/test_pixtral.py           |  1 +
 .../vision_language/test_qwen2_vl.py          |  2 +
 .../vision_language/vlm_utils/builders.py     |  1 +
 .../vlm_utils/case_filtering.py               |  1 +
 .../vision_language/vlm_utils/core.py         |  3 +-
 .../vlm_utils/custom_inputs.py                |  1 +
 .../vision_language/vlm_utils/model_utils.py  |  1 +
 .../vision_language/vlm_utils/runners.py      |  1 +
 .../vision_language/vlm_utils/types.py        |  1 +
 .../embedding/language/test_cls_models.py     |  1 +
 .../embedding/language/test_embedding.py      |  1 +
 .../models/embedding/language/test_gritlm.py  |  2 +
 .../models/embedding/language/test_scoring.py |  1 +
 tests/models/embedding/utils.py               |  2 +
 .../vision_language/test_dse_qwen2_vl.py      |  2 +
 .../vision_language/test_llava_next.py        |  2 +
 .../embedding/vision_language/test_phi3v.py   |  2 +
 .../audio_language/test_whisper.py            |  1 +
 .../encoder_decoder/language/test_bart.py     |  1 +
 .../vision_language/test_broadcast.py         |  2 +
 .../vision_language/test_florence2.py         |  2 +
 .../vision_language/test_mllama.py            |  2 +
 .../multimodal/processing/test_common.py      |  2 +
 .../multimodal/processing/test_idefics3.py    |  1 +
 .../multimodal/processing/test_internvl.py    |  1 +
 .../multimodal/processing/test_llava_next.py  |  2 +
 .../processing/test_llava_onevision.py        |  2 +
 .../multimodal/processing/test_phi3v.py       |  1 +
 .../multimodal/processing/test_qwen2_vl.py    |  2 +
 tests/models/registry.py                      |  2 +
 tests/models/test_initialization.py           |  2 +
 tests/models/test_oot_registration.py         |  2 +
 tests/models/test_registry.py                 |  2 +
 tests/models/utils.py                         |  2 +
 tests/mq_llm_engine/test_abort.py             |  1 +
 tests/mq_llm_engine/test_error_handling.py    |  1 +
 tests/mq_llm_engine/test_load.py              |  1 +
 tests/mq_llm_engine/utils.py                  |  2 +
 .../multi_step/test_correctness_async_llm.py  |  2 +
 tests/multi_step/test_correctness_llm.py      |  2 +
 tests/multimodal/test_inputs.py               |  2 +
 tests/multimodal/test_processing.py           |  2 +
 tests/multimodal/test_processor_kwargs.py     |  2 +
 tests/multimodal/test_utils.py                |  2 +
 tests/multimodal/utils.py                     |  2 +
 tests/neuron/test_prefix_prefill.py           |  2 +
 tests/plugins/vllm_add_dummy_model/setup.py   |  2 +
 .../vllm_add_dummy_model/__init__.py          |  2 +
 .../my_gemma_embedding.py                     |  2 +
 .../vllm_add_dummy_model/my_llava.py          |  2 +
 .../vllm_add_dummy_model/my_opt.py            |  2 +
 .../plugins/vllm_add_dummy_platform/setup.py  |  2 +
 .../vllm_add_dummy_platform/__init__.py       |  2 +
 .../dummy_attention_backend.py                |  2 +
 .../vllm_add_dummy_platform/dummy_platform.py |  2 +
 tests/plugins_tests/test_platform_plugins.py  |  2 +
 .../test_disable_sliding_window.py            |  1 +
 tests/prefix_caching/test_prefix_caching.py   |  1 +
 tests/prompt_adapter/test_bloom.py            |  2 +
 .../test_multi_adapter_inference.py           |  2 +
 tests/prompt_adapter/test_pa_lora.py          |  2 +
 tests/quantization/test_bitsandbytes.py       |  1 +
 tests/quantization/test_compressed_tensors.py |  1 +
 tests/quantization/test_configs.py            |  1 +
 tests/quantization/test_cpu_offload.py        |  2 +
 tests/quantization/test_experts_int8.py       |  2 +
 tests/quantization/test_fp8.py                |  1 +
 tests/quantization/test_ipex_quant.py         |  1 +
 tests/quantization/test_lm_head.py            |  1 +
 tests/quantization/test_quark.py              |  1 +
 .../test_register_quantization_config.py      |  1 +
 tests/quantization/utils.py                   |  2 +
 .../test_runai_model_streamer_loader.py       |  2 +
 .../runai_model_streamer/test_weight_utils.py |  2 +
 tests/samplers/test_beam_search.py            |  1 +
 tests/samplers/test_ignore_eos.py             |  1 +
 tests/samplers/test_logits_processor.py       |  2 +
 tests/samplers/test_logprobs.py               |  2 +
 tests/samplers/test_no_bad_words.py           |  1 +
 tests/samplers/test_ranks.py                  |  2 +
 tests/samplers/test_rejection_sampler.py      |  1 +
 tests/samplers/test_sampler.py                |  2 +
 tests/samplers/test_seeded_generate.py        |  1 +
 .../test_typical_acceptance_sampler.py        |  1 +
 tests/spec_decode/e2e/conftest.py             |  2 +
 tests/spec_decode/e2e/test_compatibility.py   |  2 +
 .../spec_decode/e2e/test_eagle_correctness.py |  1 +
 tests/spec_decode/e2e/test_integration.py     |  1 +
 .../e2e/test_integration_dist_tp2.py          |  1 +
 .../e2e/test_integration_dist_tp4.py          |  1 +
 tests/spec_decode/e2e/test_logprobs.py        |  2 +
 .../e2e/test_medusa_correctness.py            |  1 +
 tests/spec_decode/e2e/test_mlp_correctness.py |  1 +
 .../e2e/test_multistep_correctness.py         |  1 +
 .../spec_decode/e2e/test_ngram_correctness.py |  1 +
 tests/spec_decode/e2e/test_seed.py            |  2 +
 tests/spec_decode/test_batch_expansion.py     |  2 +
 tests/spec_decode/test_dynamic_spec_decode.py |  2 +
 tests/spec_decode/test_metrics.py             |  2 +
 tests/spec_decode/test_multi_step_worker.py   |  2 +
 tests/spec_decode/test_ngram_worker.py        |  2 +
 tests/spec_decode/test_scorer.py              |  2 +
 tests/spec_decode/test_spec_decode_worker.py  |  2 +
 tests/spec_decode/test_utils.py               |  2 +
 tests/spec_decode/utils.py                    |  2 +
 tests/standalone_tests/lazy_torch_compile.py  |  2 +
 tests/tensorizer_loader/conftest.py           |  2 +
 tests/tensorizer_loader/test_tensorizer.py    |  2 +
 tests/test_cache_block_hashing.py             |  1 +
 tests/test_config.py                          |  2 +
 tests/test_embedded_commit.py                 |  2 +
 tests/test_inputs.py                          |  2 +
 tests/test_logger.py                          |  2 +
 tests/test_logits_processor.py                |  2 +
 tests/test_regression.py                      |  1 +
 tests/test_sampling_params.py                 |  1 +
 tests/test_scalartype.py                      |  2 +
 tests/test_sequence.py                        |  2 +
 tests/test_sharded_state_loader.py            |  2 +
 tests/test_utils.py                           |  2 +
 tests/tokenization/test_cached_tokenizer.py   |  2 +
 tests/tokenization/test_detokenize.py         |  2 +
 tests/tokenization/test_get_eos.py            |  1 +
 tests/tokenization/test_tokenizer.py          |  2 +
 tests/tokenization/test_tokenizer_group.py    |  2 +
 tests/tool_use/conftest.py                    |  2 +
 ...est_chat_completion_request_validations.py |  2 +
 tests/tool_use/test_chat_completions.py       |  2 +
 tests/tool_use/test_jamba_tool_parser.py      |  2 +
 tests/tool_use/test_parallel_tool_calls.py    |  2 +
 tests/tool_use/test_tool_calls.py             |  2 +
 tests/tool_use/utils.py                       |  2 +
 tests/tpu/test_compilation.py                 |  2 +
 tests/tpu/test_custom_dispatcher.py           |  2 +
 tests/tpu/test_quantization_accuracy.py       |  2 +
 tests/tracing/test_tracing.py                 |  2 +
 tests/utils.py                                |  2 +
 tests/v1/core/test_kv_cache_utils.py          |  2 +
 tests/v1/core/test_prefix_caching.py          |  1 +
 tests/v1/e2e/test_cascade_attention.py        |  2 +
 tests/v1/engine/test_async_llm.py             |  2 +
 tests/v1/engine/test_engine_args.py           |  2 +
 tests/v1/engine/test_engine_core.py           |  2 +
 tests/v1/engine/test_engine_core_client.py    |  2 +
 tests/v1/engine/test_output_processor.py      |  2 +
 tests/v1/sample/test_sampler.py               |  2 +
 tests/v1/test_stats.py                        |  2 +
 tests/v1/test_utils.py                        |  2 +
 tests/v1/worker/test_gpu_input_batch.py       |  2 +
 tests/vllm_test_utils/setup.py                |  2 +
 .../vllm_test_utils/__init__.py               |  1 +
 .../vllm_test_utils/vllm_test_utils/blame.py  |  2 +
 .../vllm_test_utils/monitor.py                |  2 +
 tests/weight_loading/test_weight_loading.py   |  2 +
 .../test_encoder_decoder_model_runner.py      |  2 +
 tests/worker/test_model_input.py              |  2 +
 tests/worker/test_model_runner.py             |  2 +
 tests/worker/test_profile.py                  |  2 +
 tests/worker/test_swap.py                     |  2 +
 tools/check_spdx_header.py                    | 43 +++++++++++++++++++
 tools/profiler/print_layerwise_table.py       |  2 +
 tools/profiler/visualize_layerwise_profile.py |  2 +
 tools/report_build_time_ninja.py              |  2 +
 use_existing_torch.py                         |  2 +
 vllm/__init__.py                              |  1 +
 vllm/_custom_ops.py                           |  2 +
 vllm/_ipex_ops.py                             |  2 +
 vllm/adapter_commons/layers.py                |  2 +
 vllm/adapter_commons/models.py                |  2 +
 vllm/adapter_commons/request.py               |  2 +
 vllm/adapter_commons/utils.py                 |  2 +
 vllm/adapter_commons/worker_manager.py        |  2 +
 vllm/assets/audio.py                          |  2 +
 vllm/assets/base.py                           |  2 +
 vllm/assets/image.py                          |  2 +
 vllm/assets/video.py                          |  2 +
 vllm/attention/__init__.py                    |  2 +
 vllm/attention/backends/abstract.py           |  2 +
 vllm/attention/backends/blocksparse_attn.py   |  2 +
 vllm/attention/backends/flash_attn.py         |  1 +
 vllm/attention/backends/flashinfer.py         |  2 +
 vllm/attention/backends/hpu_attn.py           |  2 +
 vllm/attention/backends/ipex_attn.py          |  1 +
 vllm/attention/backends/mla/utils.py          |  2 +
 vllm/attention/backends/openvino.py           |  2 +
 vllm/attention/backends/pallas.py             |  2 +
 vllm/attention/backends/placeholder_attn.py   |  2 +
 vllm/attention/backends/rocm_flash_attn.py    |  1 +
 vllm/attention/backends/torch_sdpa.py         |  1 +
 vllm/attention/backends/triton_mla.py         |  2 +
 vllm/attention/backends/utils.py              |  1 +
 vllm/attention/backends/xformers.py           |  1 +
 vllm/attention/layer.py                       |  1 +
 .../blocksparse_attention_kernel.py           |  2 +
 .../ops/blocksparse_attention/interface.py    |  2 +
 .../ops/blocksparse_attention/utils.py        |  2 +
 vllm/attention/ops/hpu_paged_attn.py          |  2 +
 vllm/attention/ops/ipex_attn.py               |  2 +
 vllm/attention/ops/nki_flash_attn.py          |  2 +
 vllm/attention/ops/paged_attn.py              |  2 +
 vllm/attention/ops/prefix_prefill.py          |  2 +
 vllm/attention/ops/triton_decode_attention.py |  2 +
 vllm/attention/ops/triton_flash_attention.py  |  2 +
 vllm/attention/selector.py                    |  2 +
 vllm/beam_search.py                           |  2 +
 vllm/compilation/backends.py                  |  2 +
 vllm/compilation/counter.py                   |  2 +
 vllm/compilation/decorators.py                |  2 +
 vllm/compilation/fix_functionalization.py     |  2 +
 vllm/compilation/fusion.py                    |  2 +
 vllm/compilation/fx_utils.py                  |  2 +
 vllm/compilation/inductor_pass.py             |  2 +
 vllm/compilation/monitor.py                   |  2 +
 vllm/compilation/multi_output_match.py        |  2 +
 vllm/compilation/pass_manager.py              |  2 +
 vllm/compilation/reshapes.py                  |  2 +
 vllm/compilation/vllm_inductor_pass.py        |  2 +
 vllm/compilation/wrapper.py                   |  2 +
 vllm/config.py                                |  2 +
 vllm/connections.py                           |  2 +
 vllm/core/block/block_table.py                |  2 +
 vllm/core/block/common.py                     |  2 +
 vllm/core/block/cpu_gpu_block_allocator.py    |  2 +
 vllm/core/block/interfaces.py                 |  2 +
 vllm/core/block/naive_block.py                |  2 +
 vllm/core/block/prefix_caching_block.py       |  1 +
 vllm/core/block/utils.py                      |  1 +
 vllm/core/block_manager.py                    |  1 +
 vllm/core/evictor.py                          |  2 +
 vllm/core/interfaces.py                       |  2 +
 vllm/core/placeholder_block_space_manager.py  |  2 +
 vllm/core/scheduler.py                        |  2 +
 vllm/device_allocator/cumem.py                |  2 +
 vllm/distributed/__init__.py                  |  2 +
 vllm/distributed/communication_op.py          |  2 +
 .../device_communicators/cuda_wrapper.py      |  1 +
 .../device_communicators/custom_all_reduce.py |  2 +
 .../custom_all_reduce_utils.py                |  2 +
 .../device_communicators/hpu_communicator.py  |  2 +
 .../device_communicators/pynccl.py            |  2 +
 .../device_communicators/pynccl_wrapper.py    |  2 +
 .../device_communicators/shm_broadcast.py     |  2 +
 .../device_communicators/tpu_communicator.py  |  2 +
 .../device_communicators/xpu_communicator.py  |  2 +
 .../kv_transfer/kv_connector/base.py          |  1 +
 .../kv_transfer/kv_connector/factory.py       |  2 +
 .../kv_connector/simple_connector.py          |  1 +
 .../kv_transfer/kv_lookup_buffer/base.py      |  1 +
 .../kv_lookup_buffer/simple_buffer.py         |  1 +
 vllm/distributed/kv_transfer/kv_pipe/base.py  |  1 +
 .../kv_transfer/kv_pipe/mooncake_pipe.py      |  2 +
 .../kv_transfer/kv_pipe/pynccl_pipe.py        |  1 +
 .../kv_transfer/kv_transfer_agent.py          |  1 +
 vllm/distributed/parallel_state.py            |  2 +
 vllm/distributed/utils.py                     |  2 +
 vllm/engine/arg_utils.py                      |  2 +
 vllm/engine/async_llm_engine.py               |  2 +
 vllm/engine/async_timeout.py                  |  2 +
 vllm/engine/llm_engine.py                     |  2 +
 vllm/engine/metrics.py                        |  2 +
 vllm/engine/metrics_types.py                  |  1 +
 vllm/engine/multiprocessing/__init__.py       |  2 +
 vllm/engine/multiprocessing/client.py         |  2 +
 vllm/engine/multiprocessing/engine.py         |  2 +
 vllm/engine/output_processor/interfaces.py    |  2 +
 vllm/engine/output_processor/multi_step.py    |  2 +
 vllm/engine/output_processor/single_step.py   |  2 +
 vllm/engine/output_processor/stop_checker.py  |  2 +
 vllm/engine/output_processor/util.py          |  2 +
 vllm/engine/protocol.py                       |  2 +
 vllm/entrypoints/api_server.py                |  1 +
 vllm/entrypoints/chat_utils.py                |  2 +
 vllm/entrypoints/launcher.py                  |  2 +
 vllm/entrypoints/llm.py                       |  2 +
 vllm/entrypoints/logger.py                    |  2 +
 vllm/entrypoints/openai/api_server.py         |  2 +
 vllm/entrypoints/openai/cli_args.py           |  1 +
 vllm/entrypoints/openai/logits_processors.py  |  2 +
 vllm/entrypoints/openai/protocol.py           |  2 +
 .../openai/reasoning_parsers/__init__.py      |  2 +
 .../abs_reasoning_parsers.py                  |  2 +
 .../deepseek_r1_reasoning_parser.py           |  2 +
 vllm/entrypoints/openai/run_batch.py          |  2 +
 vllm/entrypoints/openai/serving_chat.py       |  2 +
 vllm/entrypoints/openai/serving_completion.py |  2 +
 vllm/entrypoints/openai/serving_embedding.py  |  2 +
 vllm/entrypoints/openai/serving_engine.py     |  2 +
 vllm/entrypoints/openai/serving_models.py     |  2 +
 vllm/entrypoints/openai/serving_pooling.py    |  2 +
 vllm/entrypoints/openai/serving_rerank.py     |  2 +
 vllm/entrypoints/openai/serving_score.py      |  2 +
 .../openai/serving_tokenization.py            |  2 +
 .../openai/tool_parsers/__init__.py           |  2 +
 .../tool_parsers/abstract_tool_parser.py      |  2 +
 .../granite_20b_fc_tool_parser.py             |  2 +
 .../tool_parsers/granite_tool_parser.py       |  2 +
 .../openai/tool_parsers/hermes_tool_parser.py |  2 +
 .../tool_parsers/internlm2_tool_parser.py     |  2 +
 .../openai/tool_parsers/jamba_tool_parser.py  |  2 +
 .../openai/tool_parsers/llama_tool_parser.py  |  2 +
 .../tool_parsers/mistral_tool_parser.py       |  2 +
 .../tool_parsers/pythonic_tool_parser.py      |  2 +
 vllm/entrypoints/openai/tool_parsers/utils.py |  2 +
 vllm/entrypoints/utils.py                     |  2 +
 vllm/envs.py                                  |  2 +
 vllm/executor/executor_base.py                |  2 +
 vllm/executor/mp_distributed_executor.py      |  2 +
 vllm/executor/msgspec_utils.py                |  2 +
 vllm/executor/multiproc_worker_utils.py       |  2 +
 vllm/executor/ray_distributed_executor.py     |  2 +
 vllm/executor/ray_utils.py                    |  2 +
 vllm/executor/uniproc_executor.py             |  2 +
 vllm/forward_context.py                       |  2 +
 vllm/inputs/__init__.py                       |  2 +
 vllm/inputs/data.py                           |  2 +
 vllm/inputs/parse.py                          |  2 +
 vllm/inputs/preprocess.py                     |  2 +
 vllm/inputs/registry.py                       |  2 +
 vllm/logger.py                                |  1 +
 vllm/logging_utils/__init__.py                |  2 +
 vllm/logging_utils/formatter.py               |  2 +
 vllm/logits_process.py                        |  2 +
 vllm/lora/fully_sharded_layers.py             |  2 +
 vllm/lora/layers.py                           |  2 +
 vllm/lora/lora.py                             |  2 +
 vllm/lora/models.py                           |  2 +
 vllm/lora/ops/torch_ops/__init__.py           |  2 +
 vllm/lora/ops/torch_ops/lora_ops.py           |  2 +
 vllm/lora/ops/triton_ops/__init__.py          |  2 +
 vllm/lora/ops/triton_ops/bgmv_expand.py       |  1 +
 vllm/lora/ops/triton_ops/bgmv_expand_slice.py |  1 +
 vllm/lora/ops/triton_ops/bgmv_shrink.py       |  1 +
 vllm/lora/ops/triton_ops/sgmv_expand.py       |  1 +
 vllm/lora/ops/triton_ops/sgmv_shrink.py       |  1 +
 vllm/lora/ops/triton_ops/utils.py             |  2 +
 vllm/lora/peft_helper.py                      |  2 +
 vllm/lora/punica_wrapper/__init__.py          |  2 +
 vllm/lora/punica_wrapper/punica_base.py       |  1 +
 vllm/lora/punica_wrapper/punica_cpu.py        |  2 +
 vllm/lora/punica_wrapper/punica_gpu.py        |  1 +
 vllm/lora/punica_wrapper/punica_hpu.py        |  2 +
 vllm/lora/punica_wrapper/punica_selector.py   |  2 +
 vllm/lora/punica_wrapper/utils.py             |  2 +
 vllm/lora/request.py                          |  2 +
 vllm/lora/utils.py                            |  2 +
 vllm/lora/worker_manager.py                   |  2 +
 vllm/model_executor/__init__.py               |  2 +
 vllm/model_executor/custom_op.py              |  2 +
 .../guided_decoding/__init__.py               |  2 +
 .../guided_decoding/guided_fields.py          |  2 +
 .../lm_format_enforcer_decoding.py            |  2 +
 .../guided_decoding/outlines_decoding.py      |  2 +
 .../outlines_logits_processors.py             |  2 +
 vllm/model_executor/guided_decoding/utils.py  |  2 +
 .../guided_decoding/xgrammar_decoding.py      |  2 +
 vllm/model_executor/layers/activation.py      |  1 +
 .../layers/fused_moe/__init__.py              |  2 +
 .../layers/fused_moe/fused_marlin_moe.py      |  1 +
 .../layers/fused_moe/fused_moe.py             |  1 +
 vllm/model_executor/layers/fused_moe/layer.py |  2 +
 .../layers/fused_moe/moe_pallas.py            |  2 +
 .../layers/fused_moe/moe_torch_iterative.py   |  2 +
 vllm/model_executor/layers/layernorm.py       |  1 +
 vllm/model_executor/layers/linear.py          |  2 +
 .../model_executor/layers/logits_processor.py |  1 +
 .../layers/mamba/mamba_mixer.py               |  2 +
 .../layers/mamba/ops/causal_conv1d.py         |  2 +
 .../layers/mamba/ops/mamba_ssm.py             |  2 +
 vllm/model_executor/layers/pooler.py          |  2 +
 .../layers/quantization/__init__.py           |  2 +
 .../layers/quantization/aqlm.py               |  2 +
 .../model_executor/layers/quantization/awq.py |  2 +
 .../layers/quantization/awq_marlin.py         |  2 +
 .../layers/quantization/awq_triton.py         |  2 +
 .../layers/quantization/base_config.py        |  2 +
 .../layers/quantization/bitsandbytes.py       |  2 +
 .../compressed_tensors/compressed_tensors.py  |  2 +
 .../compressed_tensors_moe.py                 |  2 +
 .../compressed_tensors/schemes/__init__.py    |  2 +
 .../schemes/compressed_tensors_24.py          |  2 +
 .../schemes/compressed_tensors_scheme.py      |  2 +
 .../schemes/compressed_tensors_w4a16_24.py    |  2 +
 .../schemes/compressed_tensors_w8a16_fp8.py   |  2 +
 .../schemes/compressed_tensors_w8a8_fp8.py    |  2 +
 .../schemes/compressed_tensors_w8a8_int8.py   |  2 +
 .../schemes/compressed_tensors_wNa16.py       |  2 +
 .../compressed_tensors/triton_scaled_mm.py    |  2 +
 .../quantization/compressed_tensors/utils.py  |  2 +
 .../layers/quantization/deepspeedfp.py        |  2 +
 .../layers/quantization/experts_int8.py       |  2 +
 .../layers/quantization/fbgemm_fp8.py         |  2 +
 .../model_executor/layers/quantization/fp8.py |  2 +
 .../layers/quantization/gguf.py               |  2 +
 .../layers/quantization/gptq.py               |  2 +
 .../layers/quantization/gptq_marlin.py        |  2 +
 .../layers/quantization/gptq_marlin_24.py     |  2 +
 .../layers/quantization/hqq_marlin.py         |  2 +
 .../layers/quantization/ipex_quant.py         |  2 +
 .../kernels/mixed_precision/MPLinearKernel.py |  2 +
 .../kernels/mixed_precision/__init__.py       |  2 +
 .../kernels/mixed_precision/exllama.py        |  2 +
 .../kernels/mixed_precision/machete.py        |  2 +
 .../kernels/mixed_precision/marlin.py         |  2 +
 .../kernels/scaled_mm/ScaledMMLinearKernel.py |  2 +
 .../kernels/scaled_mm/__init__.py             |  2 +
 .../quantization/kernels/scaled_mm/cutlass.py |  2 +
 .../quantization/kernels/scaled_mm/triton.py  |  2 +
 .../quantization/kernels/scaled_mm/xla.py     |  2 +
 .../layers/quantization/kv_cache.py           |  2 +
 .../layers/quantization/marlin.py             |  2 +
 .../layers/quantization/modelopt.py           |  2 +
 .../layers/quantization/moe_wna16.py          |  2 +
 .../layers/quantization/neuron_quant.py       |  2 +
 .../model_executor/layers/quantization/qqq.py |  2 +
 .../layers/quantization/quark/quark.py        |  2 +
 .../layers/quantization/quark/quark_moe.py    |  2 +
 .../quantization/quark/schemes/__init__.py    |  2 +
 .../quark/schemes/quark_scheme.py             |  2 +
 .../quark/schemes/quark_w8a8_fp8.py           |  2 +
 .../quark/schemes/quark_w8a8_int8.py          |  2 +
 .../layers/quantization/quark/utils.py        |  2 +
 .../layers/quantization/schema.py             |  1 +
 .../layers/quantization/tpu_int8.py           |  2 +
 .../layers/quantization/utils/__init__.py     |  2 +
 .../layers/quantization/utils/fp8_utils.py    |  2 +
 .../layers/quantization/utils/layer_utils.py  |  2 +
 .../quantization/utils/machete_utils.py       |  2 +
 .../layers/quantization/utils/marlin_utils.py |  2 +
 .../quantization/utils/marlin_utils_fp8.py    |  2 +
 .../quantization/utils/marlin_utils_test.py   |  1 +
 .../utils/marlin_utils_test_24.py             |  1 +
 .../utils/marlin_utils_test_qqq.py            |  2 +
 .../layers/quantization/utils/quant_utils.py  |  1 +
 .../layers/quantization/utils/w8a8_utils.py   |  2 +
 .../layers/rejection_sampler.py               |  2 +
 vllm/model_executor/layers/resampler.py       |  2 +
 .../model_executor/layers/rotary_embedding.py |  2 +
 vllm/model_executor/layers/sampler.py         |  1 +
 .../layers/spec_decode_base_sampler.py        |  2 +
 .../layers/typical_acceptance_sampler.py      |  2 +
 vllm/model_executor/layers/utils.py           |  1 +
 .../layers/vocab_parallel_embedding.py        |  2 +
 vllm/model_executor/model_loader/__init__.py  |  2 +
 vllm/model_executor/model_loader/loader.py    |  2 +
 vllm/model_executor/model_loader/neuron.py    |  1 +
 vllm/model_executor/model_loader/openvino.py  |  2 +
 .../model_executor/model_loader/tensorizer.py |  2 +
 vllm/model_executor/model_loader/utils.py     |  1 +
 .../model_loader/weight_utils.py              |  1 +
 vllm/model_executor/models/__init__.py        |  2 +
 vllm/model_executor/models/adapters.py        |  2 +
 vllm/model_executor/models/arctic.py          |  1 +
 vllm/model_executor/models/aria.py            |  2 +
 vllm/model_executor/models/baichuan.py        |  2 +
 vllm/model_executor/models/bart.py            |  2 +
 vllm/model_executor/models/bert.py            |  2 +
 vllm/model_executor/models/blip.py            |  1 +
 vllm/model_executor/models/blip2.py           |  2 +
 vllm/model_executor/models/bloom.py           |  2 +
 vllm/model_executor/models/chameleon.py       |  2 +
 vllm/model_executor/models/chatglm.py         |  2 +
 vllm/model_executor/models/clip.py            |  1 +
 vllm/model_executor/models/commandr.py        |  2 +
 vllm/model_executor/models/dbrx.py            |  2 +
 vllm/model_executor/models/decilm.py          |  2 +
 vllm/model_executor/models/deepseek.py        |  2 +
 vllm/model_executor/models/deepseek_v2.py     |  2 +
 vllm/model_executor/models/deepseek_v3.py     |  2 +
 vllm/model_executor/models/deepseek_vl2.py    |  2 +
 vllm/model_executor/models/eagle.py           |  2 +
 vllm/model_executor/models/exaone.py          |  2 +
 vllm/model_executor/models/fairseq2_llama.py  |  2 +
 vllm/model_executor/models/falcon.py          |  2 +
 vllm/model_executor/models/florence2.py       |  2 +
 vllm/model_executor/models/fuyu.py            |  2 +
 vllm/model_executor/models/gemma.py           |  2 +
 vllm/model_executor/models/gemma2.py          |  2 +
 vllm/model_executor/models/glm.py             |  1 +
 .../models/glm4_vision_encoder.py             |  2 +
 vllm/model_executor/models/gpt2.py            |  2 +
 vllm/model_executor/models/gpt_bigcode.py     |  2 +
 vllm/model_executor/models/gpt_j.py           |  2 +
 vllm/model_executor/models/gpt_neox.py        |  2 +
 vllm/model_executor/models/granite.py         |  2 +
 vllm/model_executor/models/granitemoe.py      |  2 +
 vllm/model_executor/models/gritlm.py          |  2 +
 vllm/model_executor/models/h2ovl.py           |  2 +
 .../models/idefics2_vision_model.py           |  2 +
 vllm/model_executor/models/idefics3.py        |  2 +
 vllm/model_executor/models/interfaces.py      |  2 +
 vllm/model_executor/models/interfaces_base.py |  2 +
 vllm/model_executor/models/intern_vit.py      |  2 +
 vllm/model_executor/models/internlm2.py       |  2 +
 vllm/model_executor/models/internlm2_ve.py    |  2 +
 vllm/model_executor/models/internvl.py        |  2 +
 vllm/model_executor/models/jais.py            |  2 +
 vllm/model_executor/models/jamba.py           |  1 +
 vllm/model_executor/models/llama.py           |  2 +
 vllm/model_executor/models/llava.py           |  2 +
 vllm/model_executor/models/llava_next.py      |  2 +
 .../model_executor/models/llava_next_video.py |  2 +
 vllm/model_executor/models/llava_onevision.py |  2 +
 vllm/model_executor/models/mamba.py           |  1 +
 vllm/model_executor/models/mamba_cache.py     |  2 +
 vllm/model_executor/models/medusa.py          |  2 +
 vllm/model_executor/models/minicpm.py         |  2 +
 vllm/model_executor/models/minicpm3.py        |  2 +
 vllm/model_executor/models/minicpmo.py        |  2 +
 vllm/model_executor/models/minicpmv.py        |  2 +
 vllm/model_executor/models/mixtral.py         |  2 +
 vllm/model_executor/models/mixtral_quant.py   |  2 +
 vllm/model_executor/models/mllama.py          |  2 +
 vllm/model_executor/models/mlp_speculator.py  |  2 +
 vllm/model_executor/models/module_mapping.py  |  2 +
 vllm/model_executor/models/molmo.py           |  2 +
 vllm/model_executor/models/mpt.py             |  2 +
 vllm/model_executor/models/nemotron.py        |  2 +
 vllm/model_executor/models/nvlm_d.py          |  2 +
 vllm/model_executor/models/olmo.py            |  2 +
 vllm/model_executor/models/olmo2.py           |  2 +
 vllm/model_executor/models/olmoe.py           |  2 +
 vllm/model_executor/models/opt.py             |  2 +
 vllm/model_executor/models/orion.py           |  2 +
 vllm/model_executor/models/paligemma.py       |  2 +
 vllm/model_executor/models/persimmon.py       |  2 +
 vllm/model_executor/models/phi.py             |  2 +
 vllm/model_executor/models/phi3.py            |  2 +
 vllm/model_executor/models/phi3_small.py      |  2 +
 vllm/model_executor/models/phi3v.py           |  2 +
 vllm/model_executor/models/phimoe.py          |  2 +
 vllm/model_executor/models/pixtral.py         |  2 +
 vllm/model_executor/models/qwen.py            |  2 +
 vllm/model_executor/models/qwen2.py           |  2 +
 vllm/model_executor/models/qwen2_audio.py     |  2 +
 vllm/model_executor/models/qwen2_moe.py       |  2 +
 vllm/model_executor/models/qwen2_rm.py        |  2 +
 vllm/model_executor/models/qwen2_vl.py        |  2 +
 vllm/model_executor/models/registry.py        |  1 +
 vllm/model_executor/models/roberta.py         |  2 +
 vllm/model_executor/models/siglip.py          |  1 +
 vllm/model_executor/models/solar.py           |  2 +
 vllm/model_executor/models/stablelm.py        |  2 +
 vllm/model_executor/models/starcoder2.py      |  2 +
 vllm/model_executor/models/telechat2.py       |  2 +
 vllm/model_executor/models/ultravox.py        |  2 +
 vllm/model_executor/models/utils.py           |  2 +
 vllm/model_executor/models/vision.py          |  2 +
 vllm/model_executor/models/whisper.py         |  2 +
 vllm/model_executor/parameter.py              |  2 +
 vllm/model_executor/pooling_metadata.py       |  2 +
 vllm/model_executor/sampling_metadata.py      |  2 +
 vllm/model_executor/utils.py                  |  1 +
 vllm/multimodal/__init__.py                   |  2 +
 vllm/multimodal/audio.py                      |  2 +
 vllm/multimodal/base.py                       |  2 +
 vllm/multimodal/hasher.py                     |  2 +
 vllm/multimodal/image.py                      |  2 +
 vllm/multimodal/inputs.py                     |  2 +
 vllm/multimodal/parse.py                      |  2 +
 vllm/multimodal/processing.py                 |  2 +
 vllm/multimodal/profiling.py                  |  2 +
 vllm/multimodal/registry.py                   |  2 +
 vllm/multimodal/utils.py                      |  2 +
 vllm/multimodal/video.py                      |  2 +
 vllm/outputs.py                               |  2 +
 vllm/platforms/__init__.py                    |  2 +
 vllm/platforms/cpu.py                         |  2 +
 vllm/platforms/cuda.py                        |  1 +
 vllm/platforms/hpu.py                         |  2 +
 vllm/platforms/interface.py                   |  2 +
 vllm/platforms/neuron.py                      |  2 +
 vllm/platforms/openvino.py                    |  2 +
 vllm/platforms/rocm.py                        |  2 +
 vllm/platforms/tpu.py                         |  2 +
 vllm/platforms/xpu.py                         |  2 +
 vllm/plugins/__init__.py                      |  2 +
 vllm/pooling_params.py                        |  2 +
 vllm/profiler/__init__.py                     |  2 +
 vllm/profiler/layerwise_profile.py            |  2 +
 vllm/profiler/utils.py                        |  2 +
 vllm/prompt_adapter/layers.py                 |  2 +
 vllm/prompt_adapter/models.py                 |  2 +
 vllm/prompt_adapter/request.py                |  2 +
 vllm/prompt_adapter/utils.py                  |  2 +
 vllm/prompt_adapter/worker_manager.py         |  2 +
 vllm/sampling_params.py                       |  1 +
 vllm/scalar_type.py                           |  2 +
 vllm/scripts.py                               |  2 +
 vllm/sequence.py                              |  1 +
 vllm/spec_decode/batch_expansion.py           |  2 +
 vllm/spec_decode/draft_model_runner.py        |  2 +
 vllm/spec_decode/interfaces.py                |  2 +
 vllm/spec_decode/medusa_worker.py             |  2 +
 vllm/spec_decode/metrics.py                   |  2 +
 vllm/spec_decode/mlp_speculator_worker.py     |  2 +
 vllm/spec_decode/mqa_scorer.py                |  2 +
 vllm/spec_decode/multi_step_worker.py         |  2 +
 vllm/spec_decode/ngram_worker.py              |  2 +
 vllm/spec_decode/proposer_worker_base.py      |  2 +
 .../spec_decode/smaller_tp_proposer_worker.py |  2 +
 vllm/spec_decode/spec_decode_worker.py        |  2 +
 vllm/spec_decode/target_model_runner.py       |  2 +
 vllm/spec_decode/top1_proposer.py             |  2 +
 vllm/spec_decode/util.py                      |  2 +
 vllm/tracing.py                               |  2 +
 vllm/transformers_utils/__init__.py           |  2 +
 vllm/transformers_utils/config.py             |  2 +
 vllm/transformers_utils/configs/__init__.py   |  2 +
 vllm/transformers_utils/configs/arctic.py     |  2 +
 vllm/transformers_utils/configs/chatglm.py    |  2 +
 vllm/transformers_utils/configs/cohere2.py    |  2 +
 vllm/transformers_utils/configs/dbrx.py       |  2 +
 .../configs/deepseek_vl2.py                   |  2 +
 vllm/transformers_utils/configs/eagle.py      |  2 +
 vllm/transformers_utils/configs/exaone.py     |  2 +
 vllm/transformers_utils/configs/falcon.py     |  2 +
 vllm/transformers_utils/configs/h2ovl.py      |  2 +
 vllm/transformers_utils/configs/internvl.py   |  2 +
 vllm/transformers_utils/configs/jais.py       |  2 +
 vllm/transformers_utils/configs/medusa.py     |  2 +
 vllm/transformers_utils/configs/mllama.py     |  2 +
 .../configs/mlp_speculator.py                 |  2 +
 vllm/transformers_utils/configs/mpt.py        |  2 +
 vllm/transformers_utils/configs/nemotron.py   |  2 +
 vllm/transformers_utils/configs/nvlm_d.py     |  2 +
 vllm/transformers_utils/configs/olmo2.py      |  2 +
 vllm/transformers_utils/configs/solar.py      |  2 +
 vllm/transformers_utils/configs/telechat2.py  |  2 +
 vllm/transformers_utils/configs/ultravox.py   |  2 +
 vllm/transformers_utils/detokenizer.py        |  2 +
 vllm/transformers_utils/detokenizer_utils.py  |  2 +
 vllm/transformers_utils/processor.py          |  2 +
 .../transformers_utils/processors/__init__.py |  2 +
 .../processors/deepseek_vl2.py                |  2 +
 vllm/transformers_utils/s3_utils.py           |  2 +
 vllm/transformers_utils/tokenizer.py          |  2 +
 .../tokenizer_group/__init__.py               |  2 +
 .../tokenizer_group/base_tokenizer_group.py   |  2 +
 .../tokenizer_group/ray_tokenizer_group.py    |  2 +
 .../tokenizer_group/tokenizer_group.py        |  2 +
 .../transformers_utils/tokenizers/__init__.py |  2 +
 vllm/transformers_utils/tokenizers/mistral.py |  2 +
 vllm/transformers_utils/utils.py              |  2 +
 vllm/triton_utils/__init__.py                 |  2 +
 vllm/triton_utils/custom_cache_manager.py     |  2 +
 vllm/triton_utils/importing.py                |  2 +
 vllm/usage/usage_lib.py                       |  2 +
 vllm/utils.py                                 |  2 +
 vllm/v1/attention/backends/flash_attn.py      |  1 +
 vllm/v1/core/encoder_cache_manager.py         |  2 +
 vllm/v1/core/kv_cache_manager.py              |  2 +
 vllm/v1/core/kv_cache_utils.py                |  1 +
 vllm/v1/core/scheduler.py                     |  2 +
 vllm/v1/engine/__init__.py                    |  2 +
 vllm/v1/engine/async_llm.py                   |  2 +
 vllm/v1/engine/core.py                        |  2 +
 vllm/v1/engine/core_client.py                 |  2 +
 vllm/v1/engine/detokenizer.py                 |  2 +
 vllm/v1/engine/llm_engine.py                  |  2 +
 vllm/v1/engine/mm_input_mapper.py             |  2 +
 vllm/v1/engine/output_processor.py            |  2 +
 vllm/v1/engine/processor.py                   |  2 +
 vllm/v1/executor/abstract.py                  |  2 +
 vllm/v1/executor/multiproc_executor.py        |  2 +
 vllm/v1/kv_cache_interface.py                 |  2 +
 vllm/v1/metrics/loggers.py                    |  2 +
 vllm/v1/metrics/stats.py                      |  2 +
 vllm/v1/outputs.py                            |  2 +
 vllm/v1/request.py                            |  2 +
 vllm/v1/sample/metadata.py                    |  2 +
 vllm/v1/sample/ops/penalties.py               |  2 +
 vllm/v1/sample/ops/topk_topp_sampler.py       |  2 +
 vllm/v1/sample/sampler.py                     |  1 +
 vllm/v1/serial_utils.py                       |  2 +
 vllm/v1/stats/common.py                       |  2 +
 vllm/v1/utils.py                              |  2 +
 vllm/v1/worker/block_table.py                 |  2 +
 vllm/v1/worker/gpu_input_batch.py             |  2 +
 vllm/v1/worker/gpu_model_runner.py            |  2 +
 vllm/v1/worker/gpu_worker.py                  |  1 +
 vllm/version.py                               |  2 +
 vllm/worker/cache_engine.py                   |  1 +
 vllm/worker/cpu_enc_dec_model_runner.py       |  2 +
 vllm/worker/cpu_model_runner.py               |  2 +
 vllm/worker/cpu_pooling_model_runner.py       |  2 +
 vllm/worker/cpu_worker.py                     |  1 +
 vllm/worker/enc_dec_model_runner.py           |  2 +
 vllm/worker/hpu_model_runner.py               |  2 +
 vllm/worker/hpu_worker.py                     |  2 +
 vllm/worker/model_runner.py                   |  2 +
 vllm/worker/model_runner_base.py              |  2 +
 vllm/worker/multi_step_model_runner.py        |  2 +
 vllm/worker/multi_step_tpu_worker.py          |  2 +
 vllm/worker/multi_step_worker.py              |  2 +
 vllm/worker/neuron_model_runner.py            |  2 +
 vllm/worker/neuron_worker.py                  |  1 +
 vllm/worker/openvino_model_runner.py          |  2 +
 vllm/worker/openvino_worker.py                |  1 +
 vllm/worker/pooling_model_runner.py           |  2 +
 vllm/worker/tpu_model_runner.py               |  2 +
 vllm/worker/tpu_worker.py                     |  2 +
 vllm/worker/utils.py                          |  1 +
 vllm/worker/worker.py                         |  1 +
 vllm/worker/worker_base.py                    |  2 +
 vllm/worker/xpu_model_runner.py               |  2 +
 vllm/worker/xpu_worker.py                     |  1 +
 1012 files changed, 1884 insertions(+), 2 deletions(-)
 create mode 100644 tools/check_spdx_header.py

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index e29eb78a9..2e4aecdd3 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 import zipfile
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
index 8350e2705..36e1b6c01 100644
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import os
 
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index afc935c1a..96e57dfd0 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 9d3646e2f..e031686c7 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 from pathlib import Path
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
index 68ac5909e..5e17b79d2 100644
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 
 from transformers import AutoTokenizer
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 052060c57..0ff95a091 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 from pathlib import Path
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
index 18bcc3a87..e5f179a0f 100644
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from lmdeploy.serve.openai.api_client import APIClient
 
 api_client = APIClient("http://localhost:8000")
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 92d6fad73..62ee5e10b 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import datetime
 import json
 import os
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ae518e190..4568efcbb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -97,10 +97,14 @@ repos:
     language: system
     verbose: true
     stages: [commit-msg]
+  - id: check-spdx-header
+    name: Check SPDX headers
+    entry: python tools/check_spdx_header.py
+    language: python
+    types: [python]
   - id: suggestion
     name: Suggestion
     entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
     language: system
     verbose: true
     pass_filenames: false
-
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 0612e8778..364b087b8 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 import sys
diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py
index 1a0e62598..2b41834ba 100644
--- a/benchmarks/benchmark_guided.py
+++ b/benchmarks/benchmark_guided.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark guided decoding throughput."""
 import argparse
 import dataclasses
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 77c4f6aa9..896312945 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
 import dataclasses
diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index 0b8fba381..21480578e 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Offline benchmark to test the long document QA throughput.
 
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 3ab421a89..23822856b 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Benchmark the efficiency of prefix caching.
 
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index e0c9e6a6d..a32065e4e 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark offline prioritization."""
 import argparse
 import dataclasses
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 8b3212831..e934d228f 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 r"""Benchmark online serving throughput.
 
 On the server side, run one of the following commands:
diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py
index 4435d87e1..561e500d8 100644
--- a/benchmarks/benchmark_serving_guided.py
+++ b/benchmarks/benchmark_serving_guided.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 r"""Benchmark online serving throughput with guided decoding.
 
 On the server side, run one of the following commands:
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c1b10b3cf..658eab6a2 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark offline inference throughput."""
 import argparse
 import dataclasses
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index 3d1c5e392..468a1b286 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
index ef06fcd66..bab377800 100644
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Cutlass bench utils
 from typing import Iterable, Tuple
 
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index b87496ca3..6552b62da 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
index d58fb0bf8..3d1121df4 100644
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
 # Example:
diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
index 4058b1c0a..980e68668 100644
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import aiohttp
diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py
index 6eb5f6398..c2ad4916b 100644
--- a/benchmarks/disagg_benchmarks/round_robin_proxy.py
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import itertools
 
diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
index e59d8bb0e..a7b4b9e8b 100644
--- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
+++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 
 import matplotlib.pyplot as plt
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index ef91f9f8e..c56cc7438 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle as pkl
 import time
 from dataclasses import dataclass
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
index 601c4ea43..8d20b9156 100644
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 from typing import Optional
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 7acea6087..d265c91bf 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 
 import torch
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index e1f613e1d..ecde8fbaa 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import json
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 46bab74ae..0301fee1a 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 8fb44e3a3..c22e66c0b 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import torch
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 068830f02..a4a45c9cb 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import time
 from datetime import datetime
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 219013a38..daedaadb1 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 import time
 from typing import List, Optional
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 1d6248344..0ddea9390 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 
 import torch
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
index baa5de0ff..dba153742 100644
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from typing import Optional, Tuple, Union
 
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 250d50516..8ee0212a0 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from itertools import accumulate
 from typing import List, Optional
 
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
index 4eeeca35a..c375e61e4 100644
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 WEIGHT_SHAPES = {
     "ideal": [[4 * 256 * 32, 256 * 32]],
     "mistralai/Mistral-7B-v0.1/TP1": [
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index 7d0bd8415..01d97d63d 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 import pickle
 import re
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
index fee877b6f..728170748 100644
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Any, Callable, Iterable, Optional
 
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index 51f24f3ba..89b05d588 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
 # Example:
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
index d16d6f9fb..5f94552e9 100644
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import cProfile
 import pstats
 
diff --git a/cmake/hipify.py b/cmake/hipify.py
index 340e41c81..2e0c8a172 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 #!/usr/bin/env python3
 
 #
diff --git a/collect_env.py b/collect_env.py
index 254c19b19..0ec9d4cae 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa
 # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
 
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index b401736c9..d5a5e2ef8 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 from typing import Dict, Union
 
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index a9b5ddf4c..02e59fe28 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 import math
 import os
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6b0a1dad1..ea3b56e02 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index ac592e223..9d4de18a3 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 import re
 from dataclasses import dataclass, field
diff --git a/examples/offline_inference/aqlm_example.py b/examples/offline_inference/aqlm_example.py
index 40f9a21ec..e8db3811f 100644
--- a/examples/offline_inference/aqlm_example.py
+++ b/examples/offline_inference/aqlm_example.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 from vllm.utils import FlexibleArgumentParser
 
diff --git a/examples/offline_inference/arctic.py b/examples/offline_inference/arctic.py
index 1fec3c99e..90c88446c 100644
--- a/examples/offline_inference/arctic.py
+++ b/examples/offline_inference/arctic.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 # Sample prompts.
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 5952ec13e..707ca9f87 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference 
 with the correct prompt format on audio language models.
diff --git a/examples/offline_inference/basic.py b/examples/offline_inference/basic.py
index 23cc6e853..a6e96c0bb 100644
--- a/examples/offline_inference/basic.py
+++ b/examples/offline_inference/basic.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 # Sample prompts.
diff --git a/examples/offline_inference/basic_with_model_default_sampling.py b/examples/offline_inference/basic_with_model_default_sampling.py
index 346bb80b1..80de9428f 100644
--- a/examples/offline_inference/basic_with_model_default_sampling.py
+++ b/examples/offline_inference/basic_with_model_default_sampling.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM
 
 # Sample prompts.
diff --git a/examples/offline_inference/chat.py b/examples/offline_inference/chat.py
index 8814f4d7b..dbc710cc8 100644
--- a/examples/offline_inference/chat.py
+++ b/examples/offline_inference/chat.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
index e69a6c067..15519bfed 100644
--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa
 import json
 import random
diff --git a/examples/offline_inference/classification.py b/examples/offline_inference/classification.py
index de539b639..4a364aeb8 100644
--- a/examples/offline_inference/classification.py
+++ b/examples/offline_inference/classification.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM
 
 # Sample prompts.
diff --git a/examples/offline_inference/cli.py b/examples/offline_inference/cli.py
index 391ac6b9b..bc6833b3f 100644
--- a/examples/offline_inference/cli.py
+++ b/examples/offline_inference/cli.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import asdict
 
 from vllm import LLM, SamplingParams
diff --git a/examples/offline_inference/cpu_offload.py b/examples/offline_inference/cpu_offload.py
index b152e5bc3..5511eb738 100644
--- a/examples/offline_inference/cpu_offload.py
+++ b/examples/offline_inference/cpu_offload.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 # Sample prompts.
diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py
index 677127844..a2df41d4c 100644
--- a/examples/offline_inference/distributed.py
+++ b/examples/offline_inference/distributed.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use Ray Data for running offline batch inference
 distributively on a multi-nodes cluster.
diff --git a/examples/offline_inference/embedding.py b/examples/offline_inference/embedding.py
index 58d004313..f9399329d 100644
--- a/examples/offline_inference/embedding.py
+++ b/examples/offline_inference/embedding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM
 
 # Sample prompts.
diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py
index 0f266d791..8765d1812 100644
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 '''
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART
diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py
index c24096e90..58610b0fd 100644
--- a/examples/offline_inference/florence2_inference.py
+++ b/examples/offline_inference/florence2_inference.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 '''
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically Florence-2
diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py
index aa05c4c0b..0447e74e0 100644
--- a/examples/offline_inference/gguf_inference.py
+++ b/examples/offline_inference/gguf_inference.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from huggingface_hub import hf_hub_download
 
 from vllm import LLM, SamplingParams
diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
index 60d894aae..501034c1c 100644
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 from typing import List, Tuple
 
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index 0c454ea50..de0734c1a 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use LoRA with different quantization techniques
 for offline inference.
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
index 8f0eb65e4..10d9de8cb 100644
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import gc
 import time
 from typing import List
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 043220d97..630fd1bf8 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use the multi-LoRA functionality
 for offline inference.
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py
index f098c8e5f..517d1bfce 100644
--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 # Sample prompts.
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
index 8ec17e340..c899a01a0 100644
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 from vllm import LLM, SamplingParams
diff --git a/examples/offline_inference/pixtral.py b/examples/offline_inference/pixtral.py
index c12ff7021..760de1145 100644
--- a/examples/offline_inference/pixtral.py
+++ b/examples/offline_inference/pixtral.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa
 import argparse
 
diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py
index 67b755a15..4c326c417 100644
--- a/examples/offline_inference/prefix_caching.py
+++ b/examples/offline_inference/prefix_caching.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 
diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py
index 8a94b5c2a..c2e072fdd 100644
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import inspect
 import json
 import os
diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py
index d7423e6c6..b1fe829b3 100644
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import dataclasses
 import os
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index 5c4918008..5000251c0 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 a simple demonstration of RLHF with vLLM, inspired by
 the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 4207f8922..863276432 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Saves each worker's model state dict directly to a checkpoint, which enables a
 fast load path for large tensor-parallel models where each worker only needs to
diff --git a/examples/offline_inference/scoring.py b/examples/offline_inference/scoring.py
index 5da9e7109..7daa82b82 100644
--- a/examples/offline_inference/scoring.py
+++ b/examples/offline_inference/scoring.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM
 
 # Sample prompts.
diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py
index abcfa8e8f..b45954b3b 100644
--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import time
 
diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py
index 00d864606..38ffd7fb9 100644
--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from enum import Enum
 
 from pydantic import BaseModel
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
index b6de73eb7..35df60115 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 experimental support for tensor-parallel inference with torchrun,
 see https://github.com/vllm-project/vllm/issues/11400 for
diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index 251629b80..bd0e98462 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 prompts = [
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 38c2b13d3..65940b6ad 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for text generation.
diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
index 4ce3d496b..3075fbbfa 100644
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for multimodal embedding.
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 43c44fa86..601ac96e1 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference with
 multi-image input on vision language models for text generation,
diff --git a/examples/offline_inference/whisper.py b/examples/offline_inference/whisper.py
index 087ad4376..59c119a77 100644
--- a/examples/offline_inference/whisper.py
+++ b/examples/offline_inference/whisper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 
 from vllm import LLM, SamplingParams
diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py
index 49a085feb..623e0d59a 100644
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Example Python client for `vllm.entrypoints.api_server`
 NOTE: The API server is used only for demonstration and simple performance
 benchmarks. It is not intended for production use.
diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py
index a07affe33..fc434ada1 100644
--- a/examples/online_serving/cohere_rerank_client.py
+++ b/examples/online_serving/cohere_rerank_client.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 the Cohere SDK: https://github.com/cohere-ai/cohere-python
diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py
index 8ceb8f68e..ee01e1eae 100644
--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 
 import gradio as gr
diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py
index 54e907582..c619146b0 100644
--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 
diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py
index bf4de76dd..3e760e171 100644
--- a/examples/online_serving/jinaai_rerank_client.py
+++ b/examples/online_serving/jinaai_rerank_client.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 Jina and Cohere https://jina.ai/reranker
diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py
index bbada3891..a81562041 100644
--- a/examples/online_serving/openai_chat_completion_client.py
+++ b/examples/online_serving/openai_chat_completion_client.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 03cc037bb..d5f798a8d 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """An example showing how to use vLLM to serve multimodal models 
 and run online serving with OpenAI client.
 
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py
index 2bbe42b6b..416fb61ca 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Set up this example by starting a vLLM OpenAI-compatible server with tool call
 options enabled. For example:
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
index 8c059c7ca..cddd93180 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from enum import Enum
 
 from openai import OpenAI
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index 83e51a48b..a88c8adb5 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index 8c14aac6b..489bfcd5e 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
index a56e7429b..f49d7a228 100644
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import base64
 import io
diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
index 58519f978..06b93d7d1 100644
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py
index 365a684d5..67c5fc91b 100644
--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Example online usage of Score API.
 
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py
index 4bd7ca01d..cb1109974 100644
--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/openai_embedding_client.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py
index 37ec8f2fb..e17f9c5ef 100644
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/openai_pooling_client.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Example online usage of Pooling API.
 
diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py
index b1a2b3c3c..7a605f85b 100644
--- a/examples/online_serving/opentelemetry/dummy_client.py
+++ b/examples/online_serving/opentelemetry/dummy_client.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import requests
 from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
     OTLPSpanExporter)
diff --git a/examples/other/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py
index 5fff1fdf5..68345e6cb 100644
--- a/examples/other/tensorize_vllm_model.py
+++ b/examples/other/tensorize_vllm_model.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import dataclasses
 import json
diff --git a/find_cuda_init.py b/find_cuda_init.py
index 51db23102..0d13b2f86 100644
--- a/find_cuda_init.py
+++ b/find_cuda_init.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import importlib
 import traceback
 from typing import Callable
diff --git a/python_only_dev.py b/python_only_dev.py
index 7d95ac96e..a303697b7 100644
--- a/python_only_dev.py
+++ b/python_only_dev.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation).
 
 TL;DR:
diff --git a/setup.py b/setup.py
index 50a2392a4..50265d46e 100755
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import ctypes
 import importlib.util
 import logging
diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py
index a3c9d5c6e..d9ac61164 100644
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """vllm.entrypoints.api_server with some extra logging for testing."""
 from typing import Any, Dict, Iterable
 
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 91ac35dd6..77f3fb002 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import subprocess
 import sys
 import time
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index 8a04693ba..ca29abc92 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 import uuid
diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py
index 5668cc30d..fd6d89d4e 100644
--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.engine.async_llm_engine import RequestTracker
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 232850406..2792dfde7 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the short outputs of HF and vLLM when using greedy sampling.
 
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 469d18a4d..cefd54d1c 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM when using greedy sampling.
 
 It tests chunked prefill. Chunked prefill can be enabled by
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index d7f36a781..b4d558ce2 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from ..utils import compare_two_settings
 
 
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 53f4ef08f..da9239b09 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from vllm import LLM, SamplingParams
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 4b27dcbc8..6aaec6eef 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the short outputs of HF and vLLM when using greedy sampling.
 
 VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index 8fa10e5bd..74bc58a2d 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from copy import deepcopy
 from typing import Callable, Union
 
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index aa1152481..9d633ad25 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Test the piecewise compilation with a simple model so that we
 can exactly calculate the expected output and side effects.
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index d4ede4d23..0404722ba 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Test the piecewise compilation with a simple model, comparing the output
 with and without the piecewise compilation.
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 1945479fc..d7acec690 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Dict, List, Optional
 
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 4dfdfe21a..6e83fa368 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.config import CompilationLevel
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index ea3aaee95..8f5040522 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index b4266a4a7..c14f0caab 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 from compressed_tensors.quantization import FP8_DTYPE
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index 03e753509..70920ab10 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle
 
 import pytest
diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py
index 74f66baaa..0934c6113 100644
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional
 
 import torch
diff --git a/tests/compile/utils.py b/tests/compile/utils.py
index 7c92d165d..e4a88584e 100644
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import torch
diff --git a/tests/conftest.py b/tests/conftest.py
index 279c1bf9a..85dd5bcb0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 import tempfile
diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py
index 0464d6a74..b7a9863f4 100644
--- a/tests/core/block/conftest.py
+++ b/tests/core/block/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 
diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py
index 70577ec05..7d3ccaada 100644
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, Iterable, Optional
 
 import pytest
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 86502f613..e9b537ed5 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from itertools import cycle
 
 import pytest
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py
index 415d0bd82..c874608e4 100644
--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import List
 
diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py
index cfd749ad5..68d9618ae 100644
--- a/tests/core/block/test_block_manager.py
+++ b/tests/core/block/test_block_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py
index e2391a568..d8cf0bec7 100644
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py
index cfdd3582e..202608730 100644
--- a/tests/core/block/test_common.py
+++ b/tests/core/block/test_common.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 
 import pytest
diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py
index a9e38d404..a1414edd9 100644
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py
index 10d5964dc..0ca2a0b80 100644
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional
 
 import pytest
diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index 6642174c1..771627a57 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 import random
 from typing import List, Optional
diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index eaaf004df..8da25aea4 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 from unittest.mock import MagicMock
 
diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py
index bd4accab7..a4a901444 100644
--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from tests.conftest import VllmRunner
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 8f6de84e5..dcc97ebaa 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from collections import deque
 from typing import List, Set, Tuple
diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py
index 16bea5493..a4e3c73a5 100644
--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest  # noqa
diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py
index d604e5250..64b3e148e 100644
--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import msgspec
 
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
diff --git a/tests/core/utils.py b/tests/core/utils.py
index 16703cd19..fb77dccce 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from collections import defaultdict
 from typing import Any, Dict, List, Optional
diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py
index fc4043cd3..72e7ebdb7 100644
--- a/tests/distributed/test_ca_buffer_sharing.py
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # can only run on machines with p2p access across GPUs
 # can only run with torchrun:
 # torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index d01f18752..bc916e8de 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test the communication operators.
 
 Run `pytest tests/distributed/test_comm_ops.py`.
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 4072616fd..46887bca4 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import random
 
diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py
index 62e77a2f7..4b0c65d1d 100644
--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from ..entrypoints.openai.test_oot_registration import (
     run_and_test_dummy_opt_api_server)
 
diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py
index 9f9c0ff07..c86d2d8a0 100644
--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Make sure ray assigns GPU workers to the correct node.
 
 Run:
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index ddbf40f08..5b6741d74 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 WARNING: This test runs in both single-node (4 GPUs) and multi-node
  (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py
index 2d4d07dd2..3ed104820 100644
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import pytest
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index 4912858d8..3bc85b05e 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import pytest
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index a8571a115..4c42a0ed8 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing
 import os
 from typing import Dict, List
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 62311a626..9b1bbd6e5 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import torch.distributed as dist
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index 723872682..59fa7cc9f 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing
 import random
 import time
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
index 7aa03d7f0..a092a548a 100644
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # unit test for `examples/offline_inference/torchrun_example.py`
 
 import random
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 5fb1ae7b2..4432950f2 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import socket
 
 import pytest
diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py
index fa5d6a69a..d0e4f8625 100644
--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """E2E tests to verify the correctness of the encoder-decoder framework
 
 Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py
index 88f3fad4c..3ba3c4ec5 100644
--- a/tests/engine/output_processor/test_multi_step.py
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from unittest.mock import MagicMock
 
diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py
index cc14e8cbf..e9ad8d161 100644
--- a/tests/engine/output_processor/test_stop_checker.py
+++ b/tests/engine/output_processor/test_stop_checker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from unittest.mock import MagicMock
 
 import pytest
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 4e269de9f..8698d124e 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from argparse import ArgumentTypeError
 
 import pytest
diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py
index ed35212cc..dca8fa602 100644
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.engine.arg_utils import EngineArgs
diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py
index 0e33f3662..3e77faecb 100644
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py
index f77f6d072..742176ea8 100644
--- a/tests/engine/test_detokenization.py
+++ b/tests/engine/test_detokenization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.entrypoints.llm import LLM
diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
index 04505fcaa..f1fe58e35 100644
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py
index a6ba7a131..d5111e3fd 100644
--- a/tests/engine/test_short_mm_context.py
+++ b/tests/engine/test_short_mm_context.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from ..conftest import IMAGE_ASSETS
diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py
index b8818af56..655c8232a 100644
--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.entrypoints.llm import LLM
diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py
index b0bd6c4aa..a50b38804 100644
--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test the different finish_reason="stop" situations during generation:
     1. One of the provided stop strings
     2. One of the provided stop tokens
diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py
index 499935620..0f633bb26 100644
--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, List, Optional
 
 import pytest
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index ef74062ce..b00e168db 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 
diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py
index 6bf7190a6..29ff00df6 100644
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This file test accuracy of the vLLM server via LMEval.
 It uses local-completions, which interacts with vLLM
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index fc66386fd..77c80b2f8 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 22473ce27..39d4810de 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm import LLM
diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py
index 3906ad766..ebec8baba 100644
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import weakref
 from typing import List
 
diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index 7d2b37775..4c78c2c8e 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import weakref
 from typing import List
 
diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py
index eb2113692..90e1d5814 100644
--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import weakref
 
 import pytest
diff --git a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py
index c2dab300e..c2b4a9358 100644
--- a/tests/entrypoints/llm/test_gpu_utilization.py
+++ b/tests/entrypoints/llm/test_gpu_utilization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 
diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py
index ccb9906fc..932a35a99 100644
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import re
 import weakref
diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py
index c9a4ad44f..925bf56a9 100644
--- a/tests/entrypoints/llm/test_init.py
+++ b/tests/entrypoints/llm/test_init.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm import LLM
diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py
index bf609b38a..b1f9ae14d 100644
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import sys
 from contextlib import nullcontext
 
diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py
index ee7010a23..f2c145fa3 100644
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm import LLM
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index 65699e609..eac76f2ba 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for HF_HUB_OFFLINE mode"""
 import importlib
 import sys
diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
index 4607e4dfe..f7b81be48 100644
--- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py
index ac73ad50a..2157e0595 100644
--- a/tests/entrypoints/openai/reasoning_parsers/utils.py
+++ b/tests/entrypoints/openai/reasoning_parsers/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple, Union
 
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py
index b1d4461d1..df25780cd 100644
--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This file test accuracy of the vLLM server via LMEval.
 It uses local-completions, which interacts with vLLM
diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py
index fcce8b46c..1f7ba0da4 100644
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import contextlib
 import random
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 1116c0da1..6e206dfd9 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List
 
 import openai
diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 547c1fd02..0d44a7611 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from http import HTTPStatus
 from typing import List
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 5e6499d8f..4b5ad55c5 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # imports for guided decoding tests
 import json
 import re
diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py
index 223ac5b41..3e76158a8 100644
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import NamedTuple
 
 import openai  # use the official client for correctness check
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index e1e1dcff7..255aba139 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py
index 61d663651..0419395f1 100644
--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 01bcd78aa..2f065ec10 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 
 import pytest
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 183d900c4..28671cc27 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # imports for guided decoding tests
 import json
 import re
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index b52a5b28c..e86ea87dd 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import base64
 
 import numpy as np
diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py
index 51eba694e..52b4df9ce 100644
--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import openai
 import pytest
 import pytest_asyncio
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 6ff99f6fa..1a62157ac 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import json
 import shutil
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 941f46571..a9134be62 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import subprocess
 import sys
 import tempfile
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index ae5bf404d..3d4f1cde2 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py
index b25cb1d0e..a1b7a205a 100644
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py
index 9c4923939..11d3bfafa 100644
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import base64
 
 import numpy as np
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index 1ae64ef49..64a1eb6a6 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # imports for guided decoding tests
 import re
 
diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py
index cfd8f3313..4c9774a73 100644
--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import requests
 
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index 99f6da160..9b33eddae 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Separate these tests out from test_completion and test_chat, because they
 # require launching a second server with a different flag. Running both servers
 # at the same time on a single node will OOM.
diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py
index 20f796061..ad8159afc 100644
--- a/tests/entrypoints/openai/test_root_path.py
+++ b/tests/entrypoints/openai/test_root_path.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import os
 from typing import Any, List, NamedTuple
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 1f8a56bb4..db049ee2b 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import subprocess
 import sys
diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py
index 0d19615bc..bcbcb5702 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import requests
 
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index e88d6c3c6..1e7dbaf60 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from contextlib import suppress
 from dataclasses import dataclass
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 657ea2021..70ca8507a 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from http import HTTPStatus
 from unittest.mock import MagicMock
 
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 090523a83..5edf85ab5 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import openai
 import pytest
 
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index b1956a8cb..663b72242 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import pytest_asyncio
 import requests
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index e73449e40..ab9285407 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List
 
 import openai
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 5f070ba3b..029c9b038 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List
 
 import openai
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index c851539c6..f2ff4a0b0 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict
 
 import pytest
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index 47b0b6bb8..788efa86b 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 from unittest.mock import MagicMock
 
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py
index f0a2a32c1..57ec98653 100644
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable, List, Tuple, Union
 
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 513b466c1..737f73309 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import warnings
 from typing import Optional
 
diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py
index 175cfe82f..97ceffab4 100644
--- a/tests/kernels/allclose_default.py
+++ b/tests/kernels/allclose_default.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 # Reference default values of atol and rtol are from
diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py
index 4f2f9cc3d..4f04ec947 100644
--- a/tests/kernels/conftest.py
+++ b/tests/kernels/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.utils import (create_kv_caches_with_random,
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index f2358940f..34dcf91c7 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple, Union
 
 import torch
diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index dac26efe8..2e70b1db3 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import Type
 
diff --git a/tests/kernels/test_aqlm.py b/tests/kernels/test_aqlm.py
index 860fb66b1..7d3617281 100644
--- a/tests/kernels/test_aqlm.py
+++ b/tests/kernels/test_aqlm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from tests.kernels.utils import opcheck
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 574a0f223..b667d8d9e 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import List, Optional, Tuple
 
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index 492acb91e..0e8743731 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from unittest.mock import Mock, patch
 
 import pytest
diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py
index aa7a43085..ace75a336 100644
--- a/tests/kernels/test_awq.py
+++ b/tests/kernels/test_awq.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import pytest
diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py
index 238d6426b..67595010c 100644
--- a/tests/kernels/test_awq_marlin.py
+++ b/tests/kernels/test_awq_marlin.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test AWQ with fused MoE Marlin kernels.
 
 Run `pytest tests/kernels/test_awq_marlin.py`.
diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/test_awq_triton.py
index 406a0c8dd..3fc3feaf4 100644
--- a/tests/kernels/test_awq_triton.py
+++ b/tests/kernels/test_awq_triton.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for the AWQ Triton kernel.
 
 Run `pytest tests/kernels/test_awq_triton.py`.
diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py
index f28fdf3fe..20eff1c20 100644
--- a/tests/kernels/test_block_fp8.py
+++ b/tests/kernels/test_block_fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from https://github.com/sgl-project/sglang/pull/2575
 import itertools
 
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py
index 08f31219e..e653d34d0 100644
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import List, Optional, Tuple
 
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index c848be4f9..6f909b680 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import List, Tuple
 
diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py
index 8edfde42e..8cc1a6a1b 100755
--- a/tests/kernels/test_cascade_flash_attn.py
+++ b/tests/kernels/test_cascade_flash_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple
 
 import pytest
diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py
index 51be2425d..93064e23d 100644
--- a/tests/kernels/test_causal_conv1d.py
+++ b/tests/kernels/test_causal_conv1d.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional
 
 import pytest
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index f538d492c..49fd8ed63 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for cutlass kernels
 
 Run `pytest tests/kernels/test_cutlass.py`.
diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py
index 56495df34..4c613b75f 100644
--- a/tests/kernels/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/test_cutlass_2of4_sparse.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for sparse cutlass kernels
 
 Run `pytest tests/kernels/test_semi_structured.py`.
diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index e008a56de..0d11e8652 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Tests:
 
diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py
index 0ee0bf6c6..b8af89b66 100644
--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple
 
 import pytest
diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py
index 1645ef911..212ceb5e4 100644
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple
 
 import flashinfer
diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/test_fp8_quant.py
index ebaaae232..876cf03fd 100644
--- a/tests/kernels/test_fp8_quant.py
+++ b/tests/kernels/test_fp8_quant.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/test_fused_quant_layernorm.py
index baf8d73fd..d4b674b23 100644
--- a/tests/kernels/test_fused_quant_layernorm.py
+++ b/tests/kernels/test_fused_quant_layernorm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple, Union
 
 import pytest
diff --git a/tests/kernels/test_ggml.py b/tests/kernels/test_ggml.py
index dddb285bf..dc728fd48 100644
--- a/tests/kernels/test_ggml.py
+++ b/tests/kernels/test_ggml.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import gguf
 import pytest
 import torch
diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py
index 893af99ba..847ca9f43 100644
--- a/tests/kernels/test_gguf.py
+++ b/tests/kernels/test_gguf.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from pathlib import Path
 from typing import List
 
diff --git a/tests/kernels/test_gptq.py b/tests/kernels/test_gptq.py
index c1ca6f1f5..fea013d9e 100644
--- a/tests/kernels/test_gptq.py
+++ b/tests/kernels/test_gptq.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from tests.kernels.utils import opcheck
diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py
index 761eb95c4..25dcb587e 100644
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py
index 727769e07..fa4bbe458 100644
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/test_machete_mm.py
index 1c6eb2dd9..bd60526ed 100644
--- a/tests/kernels/test_machete_mm.py
+++ b/tests/kernels/test_machete_mm.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for the machete kernel.
 
 Run `pytest tests/kernels/test_machete_mm.py`.
diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py
index 19d1158c7..84d4c347e 100644
--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 import torch.nn.functional as F
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 5e047f4b0..b96aca06c 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for the marlin kernel.
 
 Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py
index eab874e9e..5a18b7916 100644
--- a/tests/kernels/test_mha_attn.py
+++ b/tests/kernels/test_mha_attn.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Test:
 
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 7aa248ed1..0f13fbc96 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for the MOE layers.
 
 Run `pytest tests/kernels/test_moe.py`.
diff --git a/tests/kernels/test_permute_cols.py b/tests/kernels/test_permute_cols.py
index 14ad7a22c..35d62079f 100644
--- a/tests/kernels/test_permute_cols.py
+++ b/tests/kernels/test_permute_cols.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py
index eee77c22a..5b7b0fda2 100644
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from itertools import accumulate, product
 from typing import Dict, List, Optional
 
diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index 10e73ab95..2184c9852 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 import random
 import time
diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/test_rotary_embedding.py
index da879406b..362bcb35c 100644
--- a/tests/kernels/test_rotary_embedding.py
+++ b/tests/kernels/test_rotary_embedding.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/test_triton_decode_attention.py
index 14f5a3b77..fd3c9fa41 100644
--- a/tests/kernels/test_triton_decode_attention.py
+++ b/tests/kernels/test_triton_decode_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py
index a5aab3c2e..d878ed6f4 100644
--- a/tests/kernels/test_triton_scaled_mm.py
+++ b/tests/kernels/test_triton_scaled_mm.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for the triton_scaled_mm kernel
 
 Run `pytest tests/kernels/test_triton_scaled_mm.py`.
diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py
index 7e5126a76..d3f032002 100644
--- a/tests/kernels/test_utils.py
+++ b/tests/kernels/test_utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index c735c5edd..5be111d71 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Kernel test utils"""
 
 import itertools
diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py
index adc6150ed..97e0d6eb1 100644
--- a/tests/kv_transfer/disagg_test.py
+++ b/tests/kv_transfer/disagg_test.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import subprocess
 import sys
diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/module_test.py
index 355461919..8a6490b5c 100644
--- a/tests/kv_transfer/module_test.py
+++ b/tests/kv_transfer/module_test.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import subprocess
 import sys
 
diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
index 4d6890305..c5b34660d 100644
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import random
 
diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 1cc1ced99..181a5ac20 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import time
 from typing import List
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index e7378d007..071cdbecc 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import tempfile
 from collections import OrderedDict
 from typing import Dict, List, TypedDict
diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py
index 61b8899f0..2d33f738b 100644
--- a/tests/lora/data/long_context_test_data.py
+++ b/tests/lora/data/long_context_test_data.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa
 """This file contains a dictionary of prompts and golden responses."""
 
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 0ba2ce361..249f7619d 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 49a527b99..0aa9fe7a9 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import vllm
diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py
index 5ae705e47..8923aa221 100644
--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py
index 6aa33926c..c04174665 100644
--- a/tests/lora/test_jamba.py
+++ b/tests/lora/test_jamba.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 08a589d7e..0838ca02c 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from copy import deepcopy
 from dataclasses import dataclass
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index dfeac3809..39f779f40 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import ray
diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py
index e7a34f2ce..62005de73 100644
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import ast
 from typing import List, Optional, Tuple
 
diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py
index c2520c847..cbdd68831 100644
--- a/tests/lora/test_lora_bias_e2e.py
+++ b/tests/lora/test_lora_bias_e2e.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index b907af47d..d2a4b901b 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 1c0ee01c0..273fe9ae0 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 9a5b9aabf..6666f54fd 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import Dict, List
 
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 3b0f18325..2e81bb326 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 940a86522..90cf8fd39 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index a524d5ce5..9935472ad 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import math
 import shutil
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
index 5a3fcb8d6..651c89ffc 100644
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import vllm
diff --git a/tests/lora/test_punica_ops_sizes.py b/tests/lora/test_punica_ops_sizes.py
index 433ca7577..ecd3bc497 100644
--- a/tests/lora/test_punica_ops_sizes.py
+++ b/tests/lora/test_punica_ops_sizes.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This script is mainly used to tests various hidden_sizes. We have collected the
 hidden_sizes included in the LoRA models currently supported by vLLM. It tests
diff --git a/tests/lora/test_punica_ops_variation.py b/tests/lora/test_punica_ops_variation.py
index 2bb84c1cf..6d1d3c943 100644
--- a/tests/lora/test_punica_ops_variation.py
+++ b/tests/lora/test_punica_ops_variation.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This script is mainly used to test whether trtion kernels can run normally
 under different conditions, including various batches, numbers of LoRA , and
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index 26bf770cc..5702aa26b 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
 from dataclasses import dataclass
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 570aa3861..a988f06ab 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py
index d225a3f7d..589167e80 100644
--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py
index 85110b8fa..34a26e9ed 100644
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import OrderedDict
 from unittest.mock import patch
 
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 9d814f657..797141ea3 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import random
 import tempfile
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index ce47546f2..bda00e081 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List, Optional
 
 import torch
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index b3c785055..0942c8eed 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from typing import List
 
diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py
index 10792b0a0..b588a1a96 100644
--- a/tests/model_executor/conftest.py
+++ b/tests/model_executor/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index 0a3aba255..2c6780848 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py
index be5282d9c..64d0928f8 100644
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle
 
 import pytest
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index 9c1f784c1..760a11993 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import pytest
diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py
index c8b9bed69..11dfe4d49 100644
--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import tempfile
 
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index 1e329dc4c..fe9361d12 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple, Type
 
 import numpy as np
diff --git a/tests/models/decoder_only/language/test_aqlm.py b/tests/models/decoder_only/language/test_aqlm.py
index a8cb5bbf9..85557b30d 100644
--- a/tests/models/decoder_only/language/test_aqlm.py
+++ b/tests/models/decoder_only/language/test_aqlm.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of a AQLM model between vLLM and HF Transformers
 
 Run `pytest tests/models/test_aqlm.py`.
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py
index 5f06f1e3a..6a0e148d5 100644
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # flake8: noqa
 """Tests fp8 models against ground truth generation
 Note: these tests will only pass on L4 GPU.
diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py
index ad8f8a0c3..57fe1d5b1 100644
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Tests gguf models against unquantized models generations
 Note: To pass the test, quantization higher than Q4 should be used
diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py
index 037411a18..0f61466c3 100644
--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compares the outputs of gptq vs gptq_marlin 
 Note: GPTQ and Marlin do not have bitwise correctness.
 As a result, in this test, we just confirm that the top selected tokens of the
diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py
index 26cb3ec31..c81626148 100644
--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of a GPTQ model to a Marlin_24 model.
 
 Note: GPTQ and Marlin_24 do not have bitwise correctness.
diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py
index 5e93842f4..119b79d64 100644
--- a/tests/models/decoder_only/language/test_granite.py
+++ b/tests/models/decoder_only/language/test_granite.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Granite models using greedy sampling.
 
 Run `pytest tests/models/test_granite.py`.
diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py
index 2e06b10fb..cc98f1d7b 100644
--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from tests.utils import multi_gpu_test
diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py
index 1ad4f5aae..854f4fe4f 100644
--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
 
 Run `pytest tests/models/test_mamba.py`.
diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py
index bdc157178..179236730 100644
--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
 
 Run `pytest tests/models/test_mistral.py`.
diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py
index 077e50e3a..66dd97957 100644
--- a/tests/models/decoder_only/language/test_modelopt.py
+++ b/tests/models/decoder_only/language/test_modelopt.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # flake8: noqa
 """Tests Model Optimizer fp8 models against ground truth generation
 Note: these tests will only pass on H100
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py
index c7efa4edb..1ad562415 100644
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM when using greedy sampling.
 
 Run `pytest tests/models/test_models.py`.
diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py
index c997359a2..f9757d6ac 100644
--- a/tests/models/decoder_only/language/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for moe models using greedy sampling.
 
 Run `pytest tests/models/test_phimoe.py`.
diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py
index 18ceb34a4..31a5cd260 100644
--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Type
 
 import pytest
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
index 7406df253..9590adf6f 100644
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple
 
 import pytest
diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py
index 32fcb0bbc..a842d14fe 100644
--- a/tests/models/decoder_only/vision_language/test_intern_vit.py
+++ b/tests/models/decoder_only/vision_language/test_intern_vit.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional
 
 import pytest
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 62c644f73..e3cda8971 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index 3a8934adf..dd68fe4cd 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import re
 from typing import List, Optional, Tuple, Type
diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py
index 8103e5305..602da2b5f 100644
--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
 
 Run `pytest tests/models/test_mistral.py`.
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
index 5a485f3d8..de240a904 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, List, Optional, Tuple, Type, TypedDict, Union
 
 import numpy.typing as npt
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
index 59773be70..539410d18 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Helpers for building inputs that can be leveraged for different test types.
 """
 from pathlib import PosixPath
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
index 9bb713416..ca4ec2141 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utils for determining which subset of model tests belong to a specific
 modality, getting all combinations (similar to pytest's parametrization),
 handling multimodal placeholder substitution, and so on.
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py
index 54b7b0733..0aed26769 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Core test implementation to be shared across modalities."""
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
@@ -153,4 +154,4 @@ def process_runner_outputs(
 def process_outputs(output_processor, model, outputs_per_image):
     """Applies a model specific post-processor function to a runner's output"""
     return [[output_processor(res, model) for res in outputs]
-            for outputs in outputs_per_image]
\ No newline at end of file
+            for outputs in outputs_per_image]
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
index 2291f4fa0..2f03a114a 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Custom input builders for edge-cases in different models."""
 from typing import Callable
 
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 07bdb2cee..b0a88161c 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Common utility functions relating to different models that are useful
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
index 2d3b39fe3..fb9df37ca 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Entrypoints for wrapping the core run_test implementation for specific test
 types / modalities.
 """
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py
index e2e0c6390..ae3b9d59b 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Types for writing multimodal model tests."""
 from enum import Enum
 from pathlib import PosixPath
diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py
index 0cbe4afe9..b0420ff5c 100644
--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the classification outputs of HF and vLLM models.
 
 Run `pytest tests/models/test_cls_models.py`.
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index e17198e38..ad6385376 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the embedding outputs of HF and vLLM models.
 
 Run `pytest tests/models/embedding/language/test_embedding.py`.
diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py
index 55c2e5d4e..7ed2fb8a6 100644
--- a/tests/models/embedding/language/test_gritlm.py
+++ b/tests/models/embedding/language/test_gritlm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import importlib.util
 import math
 from array import array
diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py
index 3db27d942..d6408258f 100644
--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the scoring outputs of HF and vLLM models.
 
 Run `pytest tests/models/embedding/language/test_scoring.py`.
diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py
index f96c7d2b1..567aa5098 100644
--- a/tests/models/embedding/utils.py
+++ b/tests/models/embedding/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Sequence
 
 import torch
diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
index 2641987b2..82f2bf531 100644
--- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import partial
 from typing import Callable, Dict, List, Type
 
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
index f4cd8b81a..6ba3c5403 100644
--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Type
 
 import pytest
diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py
index 9374c23dd..0cb948746 100644
--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Type
 
 import pytest
diff --git a/tests/models/encoder_decoder/audio_language/test_whisper.py b/tests/models/encoder_decoder/audio_language/test_whisper.py
index eb238c533..80d6897da 100644
--- a/tests/models/encoder_decoder/audio_language/test_whisper.py
+++ b/tests/models/encoder_decoder/audio_language/test_whisper.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
 
 Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py
index 10aba8427..81b629fdc 100644
--- a/tests/models/encoder_decoder/language/test_bart.py
+++ b/tests/models/encoder_decoder/language/test_bart.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for BART models using greedy sampling.
 
 Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py
index 542f41a38..8d986414e 100644
--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from ....utils import multi_gpu_test
diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py
index d686f1da3..a1d156799 100644
--- a/tests/models/encoder_decoder/vision_language/test_florence2.py
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import partial
 from typing import List, Optional, Tuple, Type
 
diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index 16c71228e..4cd2dbdb4 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple, Type, overload
 
 import pytest
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index ca28da268..3921d4e19 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import partial
 
 import numpy as np
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index 69b91ad4a..00c1dae51 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for Idefics3's multimodal preprocessing kwargs."""
 from typing import Optional
 
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index d6c60595c..0d921e9d3 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for InternVL's multimodal preprocessing kwargs."""
 from typing import Callable, Optional
 
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index 6de649f87..d2497e62d 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from functools import partial
 
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index 806437d35..bd4dbd46d 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from functools import partial
 
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index 7f82a8f18..44edec457 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for phi3v's multimodal preprocessing kwargs."""
 import pytest
 
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index de14fbbff..47c9b0add 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 7952e65aa..d0dbbf00e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass, field
 from typing import AbstractSet, Any, Literal, Mapping, Optional
 
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index d3a3aaf67..64928a65d 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from unittest.mock import patch
 
 import pytest
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index 2c413a633..ef665baa1 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import pytest
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index ac0366847..80d3f78f9 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import warnings
 
 import pytest
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 0eb3f61f1..e2be43c12 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import warnings
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py
index 782b508a5..808346b5e 100644
--- a/tests/mq_llm_engine/test_abort.py
+++ b/tests/mq_llm_engine/test_abort.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test that aborting is handled properly."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py
index 83bc4e7cf..35d001781 100644
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test that various errors are handled properly."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py
index 630c112d0..2069ff987 100644
--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test that the MQLLMEngine is able to handle 10k concurrent requests."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py
index f717c1355..11e44f12b 100644
--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import multiprocessing
 from typing import Callable, Tuple, Union
diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index b8524ed83..9822cee14 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Test the AsyncLLMEngine with multi-step-decoding
 from typing import List, Optional
 
diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py
index 34030d9d6..29d5ffd4c 100644
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Test the LLMEngine with multi-step-decoding
 
 import copy
diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py
index 678bbb52b..f5d3e282f 100644
--- a/tests/multimodal/test_inputs.py
+++ b/tests/multimodal/test_inputs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 13f820d01..6cccd2aa2 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from contextlib import nullcontext
 from typing import cast
 from unittest.mock import MagicMock
diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py
index d141cdf1f..5d18b2ed7 100644
--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from array import array
 from typing import Callable, Dict, Mapping, Optional
 from unittest.mock import patch
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 198344e5b..f9e0f507a 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import base64
 import mimetypes
 import os
diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py
index 29aeca605..9a336b7e6 100644
--- a/tests/multimodal/utils.py
+++ b/tests/multimodal/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import numpy as np
 from PIL import Image
 
diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py
index 77b707a73..dfbcfc15e 100644
--- a/tests/neuron/test_prefix_prefill.py
+++ b/tests/neuron/test_prefix_prefill.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import Optional
 
diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py
index 9b535127f..e3fb6efb2 100644
--- a/tests/plugins/vllm_add_dummy_model/setup.py
+++ b/tests/plugins/vllm_add_dummy_model/setup.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from setuptools import setup
 
 setup(name='vllm_add_dummy_model',
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
index 62a8f871f..0c431cb39 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import ModelRegistry
 
 
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index 5e7d7d187..3af62b288 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable, List, Optional, Tuple, Union
 
 import torch
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index ac64edfd4..c23ab6430 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional
 
 import torch
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
index 569ef216c..bbd11ed4a 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional
 
 import torch
diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py
index 316399068..10df0b5e0 100644
--- a/tests/plugins/vllm_add_dummy_platform/setup.py
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from setuptools import setup
 
 setup(
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
index 594cef520..0d1b062ac 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional
 
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
index 5634be3c8..33425bbc1 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.attention.backends.flash_attn import FlashAttentionBackend
 
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index d7c6bdd70..5cefafc7e 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.platforms.cuda import CudaPlatform
 
 
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 661aa5f64..ed50fe535 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from tests.kernels.utils import override_backend_env_variable
diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py
index 5a28943b7..19f393e07 100644
--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the with and without prefix caching.
 
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 8d16710f1..90d424fe3 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the with and without prefix caching.
 
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py
index 6528b3009..a31d8e873 100644
--- a/tests/prompt_adapter/test_bloom.py
+++ b/tests/prompt_adapter/test_bloom.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 import vllm
diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py
index 39a79becd..e249a6e64 100644
--- a/tests/prompt_adapter/test_multi_adapter_inference.py
+++ b/tests/prompt_adapter/test_multi_adapter_inference.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 
diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py
index 2a5f23f7f..fb4c3e149 100644
--- a/tests/prompt_adapter/test_pa_lora.py
+++ b/tests/prompt_adapter/test_pa_lora.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from huggingface_hub import snapshot_download
 
 from vllm import EngineArgs, LLMEngine, SamplingParams
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 569fc8dfb..4b5210cdf 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 '''Tests whether bitsandbytes computation is enabled correctly.
 
 Run `pytest tests/quantization/test_bitsandbytes.py`.
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 1072697ec..7e2e6f6ed 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test model set-up and weight loading for llmcompressor-quantized models.
 
 Run `pytest tests/quantization/test_compressed_tensors.py`.
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py
index cf77ccec7..0abbd8ebb 100644
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests whether Marlin models can be loaded from the autogptq config.
 
 Run `pytest tests/quantization/test_configs.py --forked`.
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index 21ce5174c..29a5721ef 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Expanded quantized model tests for CPU offloading
 # Base tests: tests/basic_correctness/test_cpu_offload.py
 
diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py
index ec31c94ef..b6db6d5f2 100644
--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # flake8: noqa
 """Tests experts_int8 quantization startup and generation, 
 doesn't test correctness
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 4bff73474..5616935eb 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests whether FP8 computation is enabled correctly.
 
 Run `pytest tests/quantization/test_fp8.py --forked`.
diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py
index 68a73f0f8..0e3913676 100644
--- a/tests/quantization/test_ipex_quant.py
+++ b/tests/quantization/test_ipex_quant.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test model set-up and inference for quantized HF models supported
  on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
  
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
index fa2d9645e..ec60d8a57 100644
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests whether gptq models with quantized lm_head can be loaded.
 
 Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index 11382ad70..491370c7c 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test model set-up and weight loading for quark-quantized models.
 
 Run `pytest tests/quantization/test_quark.py`.
diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
index 8e7f44a39..9e1867f91 100644
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests register custom quantization config.
 
 See https://github.com/vllm-project/vllm/issues/11926 for more details.
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 8ebd8dd2b..7a339c162 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.model_executor.layers.quantization import get_quantization_config
 from vllm.platforms import current_platform
 
diff --git a/tests/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/runai_model_streamer/test_runai_model_streamer_loader.py
index c5722fbae..aa91fa8e1 100644
--- a/tests/runai_model_streamer/test_runai_model_streamer_loader.py
+++ b/tests/runai_model_streamer/test_runai_model_streamer_loader.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import SamplingParams
 from vllm.config import LoadConfig, LoadFormat
 from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader,
diff --git a/tests/runai_model_streamer/test_weight_utils.py b/tests/runai_model_streamer/test_weight_utils.py
index 5c89bd78a..4afa76c51 100644
--- a/tests/runai_model_streamer/test_weight_utils.py
+++ b/tests/runai_model_streamer/test_weight_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import glob
 import tempfile
 
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 4d1a6978d..39feb1895 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM when using beam search.
 
 Run `pytest tests/samplers/test_beam_search.py`.
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index dc2482d85..7f26698c9 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Make sure ignore_eos works.
 
 Run `pytest tests/samplers/test_ignore_eos.py`.
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py
index 297947012..3b95b0389 100644
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index c07c71e38..59d36099c 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index 4190cf7cd..cc6557694 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Make sure bad_words works.
 
 Run `pytest tests/samplers/test_no_bad_words.py`.
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
index ed2fee1ae..c74c1c02c 100644
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm import SamplingParams
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index dcb1b27bf..cc199bf68 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for rejection sampling."""
 from typing import List, Tuple
 
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 28c34064f..ca09e536a 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 import random
 from dataclasses import dataclass
diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py
index bf1ee6c39..4e8282561 100644
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Verify that seeded random sampling is deterministic.
 
 Run `pytest tests/samplers/test_seeded_generate.py`.
diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py
index 4ddad66dc..ecf98179c 100644
--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for rejection sampling."""
 
 import pytest
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 5cb982a08..53c888816 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from itertools import cycle
 from typing import List, Optional, Sequence, Tuple, Union
 
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index af8397c23..14a0ebf1d 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm import SamplingParams
diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py
index 5bc70de9d..6d1803f8b 100644
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py
index b89e58497..c67fa8514 100644
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests which cover integration of the speculative decoding framework with
 other features, e.g. cuda graphs.
 """
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
index 7001ee4c0..e5a542b6d 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests which cover integration of the speculative decoding framework with
 tensor parallelism.
 """
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py
index 2cb10de1c..cb9c46dc7 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests which cover integration of the speculative decoding framework with
 tensor parallelism.
 """
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
index 1a543606c..5991a8b02 100644
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from itertools import cycle
 
 import pytest
diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py
index dbcbc0db1..807f41cc9 100644
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
index 1fa1104f5..a2b84b902 100644
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index 05ad468dd..d396e52a9 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """The tests in this file verify end-to-end speculative decoding correctness.
 
 This docstring details important information on the testing methodology.
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
index 77f8b8998..1aff53cb5 100644
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py
index e42cf416b..b7d279f29 100644
--- a/tests/spec_decode/e2e/test_seed.py
+++ b/tests/spec_decode/e2e/test_seed.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from .conftest import run_equality_correctness_test
diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py
index 3504fcf43..fe95ff9b9 100644
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index aa49a3aee..0bff0ea1d 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from unittest.mock import MagicMock, patch
 
 import pytest
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 7477486a3..1a6693e16 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from unittest.mock import MagicMock
 
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index 0b5d82b66..2bf401613 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import Dict, List
 from unittest.mock import MagicMock
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index f66e95718..7de54b3ed 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from vllm.sequence import ExecuteModelRequest
diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py
index 5a093dea1..7bbbb0236 100644
--- a/tests/spec_decode/test_scorer.py
+++ b/tests/spec_decode/test_scorer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import List
 
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index d8c3af4c1..eee0f4c89 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from collections import defaultdict
 from types import SimpleNamespace
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py
index 195fce648..24573e224 100644
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from unittest.mock import MagicMock
 
 import pytest
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 2f883c2ff..38f57e99b 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from itertools import count
 from typing import Callable, Dict, List, Optional
 from typing import Sequence as GenericSequence
diff --git a/tests/standalone_tests/lazy_torch_compile.py b/tests/standalone_tests/lazy_torch_compile.py
index b950877a4..b3b580952 100644
--- a/tests/standalone_tests/lazy_torch_compile.py
+++ b/tests/standalone_tests/lazy_torch_compile.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Description: Test the lazy import module
 # The utility function cannot be placed in `vllm.utils`
 # this needs to be a standalone script
diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py
index 2a4565362..694bb5fbc 100644
--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/tensorizer_loader/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 import gc
 from typing import Callable, TypeVar
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 6e7eec1c6..b268d4bf0 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import gc
 import json
 import os
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index e8f8499aa..17c128a17 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test hashing of cache blocks.
 
 Run `pytest tests/test_cache_block_hashing.py`.
diff --git a/tests/test_config.py b/tests/test_config.py
index ec366b93d..2dfae218b 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import asdict
 
 import pytest
diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py
index ffeacf34b..a9b4f5cbf 100644
--- a/tests/test_embedded_commit.py
+++ b/tests/test_embedded_commit.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import vllm
 
 
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index fff7c5fc0..fff909154 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/test_logger.py b/tests/test_logger.py
index e3749616d..993822e92 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import logging
 import os
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py
index 39c1c3815..487fbb8fc 100644
--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 from typing import Tuple
 from unittest.mock import patch
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 5d27d3579..f781b3113 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Containing tests that check for regressions in vLLM's behavior.
 
 It should include tests that are reported by users and making sure they
diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py
index 01cbe0c99..40e26ed51 100644
--- a/tests/test_sampling_params.py
+++ b/tests/test_sampling_params.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Tests for the SamplingParams class.
 """
 from vllm import SamplingParams
diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py
index a9221f08c..6e36f2c33 100644
--- a/tests/test_scalartype.py
+++ b/tests/test_scalartype.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import torch
 
diff --git a/tests/test_sequence.py b/tests/test_sequence.py
index 30e53a180..902de1099 100644
--- a/tests/test_sequence.py
+++ b/tests/test_sequence.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.model_executor.layers.sampler import SamplerOutput
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index 2412da503..088b95be7 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing as mp
 import os
 import shutil
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d5dc4464e..5b69ffd18 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 import socket
diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py
index 4c8238fd8..cd60cefd7 100644
--- a/tests/tokenization/test_cached_tokenizer.py
+++ b/tests/tokenization/test_cached_tokenizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from copy import deepcopy
 
 from transformers import AutoTokenizer
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 84348cbc0..57832394d 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, Generator, List, Optional
 
 import pytest
diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py
index 875ca19d3..787fb6ea6 100644
--- a/tests/tokenization/test_get_eos.py
+++ b/tests/tokenization/test_get_eos.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This test file includes some cases where it is inappropriate to
 only get the `eos_token_id` from the tokenizer as defined by
diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py
index 8db7204f1..eddc63098 100644
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 from transformers import PreTrainedTokenizerBase
 
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
index 3faaf326f..8e99f8691 100644
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 import sys
diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py
index 294acf202..39ab01c9b 100644
--- a/tests/tool_use/conftest.py
+++ b/tests/tool_use/conftest.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import pytest_asyncio
 from huggingface_hub import snapshot_download
diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py
index 3d0fe8f06..7bee56281 100644
--- a/tests/tool_use/test_chat_completion_request_validations.py
+++ b/tests/tool_use/test_chat_completion_request_validations.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py
index 75bbfbb76..da033fa1d 100644
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import openai
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index 3095ef451..7e349c512 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 from typing import Generator, List, Optional
 
diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py
index c294cb049..b49a5e8e7 100644
--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 from typing import Dict, List, Optional
 
diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py
index fe8cb496c..45f1bfc45 100644
--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 from typing import Dict, List, Optional
 
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index 2241f1846..a7dfb1078 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
 
diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py
index b7124ebc1..6ed83f30e 100644
--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import glob
 import os
 import tempfile
diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py
index bb1379deb..e94bbd287 100644
--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 from vllm.config import CompilationLevel
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
index 6cd5615c4..3db9bc73a 100644
--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 
 import lm_eval
diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 49a16d16e..592775e8b 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import threading
 from concurrent import futures
diff --git a/tests/utils.py b/tests/utils.py
index f4eecf19e..3b32052fe 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import copy
 import functools
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 0a5ba1f98..60cf4384d 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.multimodal.inputs import MultiModalKwargs
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 5c1cda285..2e16d7d25 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Compare the with and without prefix caching."""
 import pytest
 
diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py
index 8ec9f1ba3..a8079dcce 100644
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams
 
 
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 10f783b21..4b5bc9ced 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from contextlib import ExitStack
 from typing import List, Tuple
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index ff38a4568..a3540582a 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm import envs
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 033bbcfce..6a91f1901 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 import uuid
 
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index e2c728b22..b2539132f 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import time
 import uuid
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 4735c6f94..5782a249f 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py
index 5ebf72927..f7eedcb9c 100644
--- a/tests/v1/sample/test_sampler.py
+++ b/tests/v1/sample/test_sampler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Set, Tuple
 
 import numpy as np
diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py
index 580392ac5..48419d8a2 100644
--- a/tests/v1/test_stats.py
+++ b/tests/v1/test_stats.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 from vllm.sampling_params import SamplingParams
diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py
index ac773b611..9b669ae00 100644
--- a/tests/v1/test_utils.py
+++ b/tests/v1/test_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import torch
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 694ce81ff..5b40fbff8 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List, Set, Tuple
 
 import numpy as np
diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py
index 790e891ec..c03943149 100644
--- a/tests/vllm_test_utils/setup.py
+++ b/tests/vllm_test_utils/setup.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from setuptools import setup
 
 setup(
diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py
index 6505c8154..1d1219fbe 100644
--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 vllm_utils is a package for vLLM testing utilities.
 It does not import any vLLM modules.
diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py
index 1ddd3471d..392fd2705 100644
--- a/tests/vllm_test_utils/vllm_test_utils/blame.py
+++ b/tests/vllm_test_utils/vllm_test_utils/blame.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import dataclasses
 import sys
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
index a237f53a7..44d45f262 100644
--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import dataclasses
 import sys
diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py
index 7a3786456..e456bfab8 100644
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import pytest
diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py
index a6b3cb575..0ce0465a7 100644
--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from typing import List
 
diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
index 57f1fd47a..eb341fb1b 100644
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import List, Tuple, Type
 
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index aabe913c2..c32ceb4fa 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import pytest
diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py
index 79233c757..22466105b 100644
--- a/tests/worker/test_profile.py
+++ b/tests/worker/test_profile.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from vllm.engine.arg_utils import EngineArgs
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index acede959f..7ae0f4bb8 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from vllm.engine.arg_utils import EngineArgs
diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py
new file mode 100644
index 000000000..3f7fd66bf
--- /dev/null
+++ b/tools/check_spdx_header.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+
+SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0"
+SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
+
+
+def check_spdx_header(file_path):
+    with open(file_path, encoding='UTF-8') as file:
+        lines = file.readlines()
+        if not lines:
+            # not necessary for an empty file like __init__.py
+            return True
+        if not lines[0].strip().startswith(SPDX_HEADER_PREFIX):
+            return False
+    return True
+
+
+def add_header(file_path):
+    with open(file_path, 'r+', encoding='UTF-8') as file:
+        lines = file.readlines()
+        file.seek(0, 0)
+        file.write(SPDX_HEADER + '\n\n' + ''.join(lines))
+
+
+def main():
+    files_with_missing_header = []
+    for file_path in sys.argv[1:]:
+        if not check_spdx_header(file_path):
+            files_with_missing_header.append(file_path)
+
+    if files_with_missing_header:
+        print("The following files are missing the SPDX header:")
+        for file_path in files_with_missing_header:
+            print(f"  {file_path}")
+            add_header(file_path)
+
+    sys.exit(1 if files_with_missing_header else 0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 54cd60c2b..adbb7301b 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 from typing import Dict
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index cb56ebd69..c527cdbe0 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import json
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index 9dc19f5fd..33e85b9ff 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 #!/usr/bin/env python3
 # Copyright (c) 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
diff --git a/use_existing_torch.py b/use_existing_torch.py
index 319d26289..a578328b0 100644
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import glob
 
 requires_files = glob.glob('requirements*.txt')
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 2aabe820d..566c5116d 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 import os
 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index da237da2e..ce4f75341 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import importlib
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 28b804f76..ccb67baa5 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple
 
 import torch
diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py
index 3ed60678b..18e0c5227 100644
--- a/vllm/adapter_commons/layers.py
+++ b/vllm/adapter_commons/layers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Tuple
 
diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py
index 468904c90..f9a5d2fff 100644
--- a/vllm/adapter_commons/models.py
+++ b/vllm/adapter_commons/models.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Optional, TypeVar
 
diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py
index 2bb17fdc0..2b604b91b 100644
--- a/vllm/adapter_commons/request.py
+++ b/vllm/adapter_commons/request.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 
 
diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py
index 1e9adca50..c2dc5433c 100644
--- a/vllm/adapter_commons/utils.py
+++ b/vllm/adapter_commons/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Callable, Dict, Optional, Set
 
 
diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py
index 83929e82e..ce24e08a5 100644
--- a/vllm/adapter_commons/worker_manager.py
+++ b/vllm/adapter_commons/worker_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Any, Optional, Set
 
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index a46c67ad7..d9e51082e 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Literal
 from urllib.parse import urljoin
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
index 249173141..03f3b9dab 100644
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import lru_cache
 from pathlib import Path
 from typing import Optional
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index 0a55506f8..2b1d258da 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Literal
 
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index eca2ccc54..494cfc383 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from functools import lru_cache
 from typing import List, Literal
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py
index 2cd4ad3e0..85c5715fa 100644
--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata,
                                               AttentionMetadataBuilder,
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index b9425f659..5f0a54013 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, fields
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 20e9a3f13..9765e7881 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple, Type
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 4a9aa1e21..6a82127ac 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Attention layer with FlashAttention."""
 from collections import defaultdict
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 7cccef960..715ed6748 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from collections import defaultdict
 from contextlib import contextmanager
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 80c132c0a..1518e518e 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
 ###############################################################################
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 57916a3c6..b4879af4c 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """ Attention layer with torch scaled_dot_product_attention
     and PagedAttention."""
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py
index e8fec234c..9b63192ed 100644
--- a/vllm/attention/backends/mla/utils.py
+++ b/vllm/attention/backends/mla/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, Generic, List, Optional, Tuple
diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
index be06d1600..f58528dbf 100644
--- a/vllm/attention/backends/openvino.py
+++ b/vllm/attention/backends/openvino.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Type
 
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 209a623ba..b61dfe63d 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
 
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index 826311896..9f6e731af 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 12110ec73..02bff57a6 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Attention layer ROCm GPUs."""
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c3b2398b4..25fe6ed95 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """ Attention layer with torch scaled_dot_product_attention
     and PagedAttention."""
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
index 95dc119a4..20d7ef0fa 100644
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 7f2fe7e83..ad53e4e70 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Attention backend utils"""
 from collections import defaultdict
 from contextlib import contextmanager
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 49f47f9c8..723a4558d 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Attention layer with xFormers and PagedAttention."""
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index b97165f62..19ee89630 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Attention layer."""
 from typing import Any, Dict, List, Optional
 
diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
index 727a470ba..71caf3cba 100644
--- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
+++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import triton
 import triton.language as tl
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index 350f88c8f..6ab69ea5b 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 
 import torch
diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py
index 78d752230..4de9bd530 100644
--- a/vllm/attention/ops/blocksparse_attention/utils.py
+++ b/vllm/attention/ops/blocksparse_attention/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Helper functions for 3D sparse pattern
 # These function are not optimized and very inefficient.
 # Avoid calling them too frequent or use a cache mechanism.
diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py
index 4c0fb2a62..8bb536343 100644
--- a/vllm/attention/ops/hpu_paged_attn.py
+++ b/vllm/attention/ops/hpu_paged_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
 ###############################################################################
diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py
index 3a07184ed..598ceea13 100644
--- a/vllm/attention/ops/ipex_attn.py
+++ b/vllm/attention/ops/ipex_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List, Optional, Tuple
 
 try:
diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
index 9de4ef7f5..68aa63f5a 100644
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 
 import neuronxcc.nki.isa as nisa
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index fd6232914..2c60bd0c3 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
 
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index ec3c8459c..fbb6757ee 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # The kernels in this file are adapted from LightLLM's context_attention_fwd:
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
 
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py
index 675df109b..ec5ec4ce6 100644
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
 # which was originally adapted from
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index ef04603f2..ab8fb8953 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 #!/usr/bin/env python
 """
 Fused Attention
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 4c6bbc727..26c6ac812 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from contextlib import contextmanager
 from functools import cache
diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index 026037e54..97b2b630f 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 7f4f97466..979890170 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import ast
 import copy
 import dataclasses
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index 6385f1c5d..a6f11a3af 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import dataclasses
 from contextlib import contextmanager
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 17eb0592c..20afe6967 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import inspect
 from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
 from unittest.mock import patch
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index e15d7b315..9b0e9c5d0 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 from typing import Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index cde27bd10..0c3d8697b 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
 
 import torch
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
index 924e26f2e..b9a8d3112 100644
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 from typing import Iterable, Optional
 
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index f6846c08a..be663946f 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import hashlib
 import inspect
 import types
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index b97e40415..786c7c1e1 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import time
 
diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py
index b6bcecdc8..e6f6a60b2 100644
--- a/vllm/compilation/multi_output_match.py
+++ b/vllm/compilation/multi_output_match.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import abc
 import operator
 from abc import abstractmethod
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 34f5f3557..c7387fb7c 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List
 
 from torch import fx as fx
diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py
index ba28b1f0b..292baae85 100644
--- a/vllm/compilation/reshapes.py
+++ b/vllm/compilation/reshapes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Union
 
 import torch.fx
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index b8c52a7f4..1d2597e42 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 
 import torch
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 58a8fa76f..a8a283ddd 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 from abc import abstractmethod
diff --git a/vllm/config.py b/vllm/config.py
index a13700aba..d2d59c705 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import ast
 import copy
 import enum
diff --git a/vllm/connections.py b/vllm/connections.py
index 4c9f4f40c..dc060bb6f 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from pathlib import Path
 from typing import Mapping, MutableMapping, Optional
 from urllib.parse import urlparse
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 90c1438ef..d4d31c58d 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from typing import List, Optional
 
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index 115f663e4..1966eac1c 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import deque
 from dataclasses import dataclass
 from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index c3e1665b4..359b5b263 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, FrozenSet, List, Optional, Tuple
 
 from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index cb432db91..0b0197deb 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
 
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index c38ae2dd6..c388366b8 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import deque
 from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
 
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index ccdc5daa9..fbf19e1b4 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Token blocks."""
 import sys
 from bisect import bisect_left
diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py
index 1c6578e4c..910afdd9f 100644
--- a/vllm/core/block/utils.py
+++ b/vllm/core/block/utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Block manager utils."""
 from vllm.sequence import SequenceGroup
 from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index 2d6a132ed..c5b3b04f3 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A block manager that manages token blocks."""
 from typing import Dict, List, Optional
 from typing import Sequence as GenericSequence
diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py
index c93065182..0e363eddc 100644
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 import heapq
 from abc import ABC, abstractmethod
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index 9c7e246e3..b48ba87e9 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 from abc import ABC, abstractmethod
 from typing import List
diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py
index f9924be4a..70c22afa8 100644
--- a/vllm/core/placeholder_block_space_manager.py
+++ b/vllm/core/placeholder_block_space_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Tuple
 
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 2bb961481..f507847ad 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 import os
 import random
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index a43418dbb..f74ad9ac3 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # cumem-based pytorch pluggable allocator to implement sleep mode.
 # other approaches tried but failed:
 # - cuda-python package binding
diff --git a/vllm/distributed/__init__.py b/vllm/distributed/__init__.py
index db325cfab..39955ddac 100644
--- a/vllm/distributed/__init__.py
+++ b/vllm/distributed/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .communication_op import *
 from .parallel_state import *
 from .utils import *
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index e13505dc3..0228264f9 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index d5a53381c..010caf7eb 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """This file is a pure Python wrapper for the cudart library.
 It avoids the need to compile a separate shared library, and is
 convenient for use when we just need to call a few functions.
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 62929dc0f..a2614ed5d 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import ctypes
 from contextlib import contextmanager
 from typing import List, Optional, Union
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 1f78e10cc..d8d6eed2d 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import ctypes
 import json
 import os
diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py
index cc9b19ce0..3f85da98a 100644
--- a/vllm/distributed/device_communicators/hpu_communicator.py
+++ b/vllm/distributed/device_communicators/hpu_communicator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index efc599871..0ccd42312 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Union
 
 # ===================== import region =====================
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 7dea61b6a..03c3b0be7 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # This file is a pure Python wrapper for the NCCL library.
 # The main purpose is to use NCCL combined with CUDA graph.
 # Before writing this script, we tried the following approach:
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 268edc092..48ac81ac0 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import pickle
 import sys
diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py
index 765a0f9cb..7af7c65f6 100644
--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 import torch
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py
index eafd3c2f6..79ccc101e 100644
--- a/vllm/distributed/device_communicators/xpu_communicator.py
+++ b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py
index 6089e3bab..57c764b48 100644
--- a/vllm/distributed/kv_transfer/kv_connector/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/base.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 KVConnectorBase Class for Distributed KV Cache & Hidden State communication
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 6372dab72..fe4805334 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import importlib
 from typing import TYPE_CHECKING, Callable, Dict, Type
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
index 7780e2dfa..2033e9762 100644
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Simple KV Cache Connector for Distributed Machine Learning Inference
 
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
index bad119a1a..845da7c50 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This file contains a new class `KVLookupBufferBase` that allows developers to 
 think of KV cache operations as inserting new KV cache entries (`insert`) 
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
index fe8d8d737..5e1b62352 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
     Implements a distributed key-value (KV) cache transfer mechanism.
 
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
index 4b0cb44cc..40589fb3e 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This file defines an interface `KVPipeBase`
 that provides an abstraction for sending and receiving tensors, or None, via
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 8e4358672..58ab7f0b6 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 import pickle
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index 98222fa67..7aa53d07a 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
     This module implements a PyNccl pipe for sending and receiving 
     Optional[torch.Tensor] between distributed ranks with advanced 
diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py
index 9ce97851d..1e80e0bd7 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_agent.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A centralized entrypoint to perform distributed KV cache transfer.
 
 This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 7fe9b68d4..c5c5dfbba 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 The vLLM team.
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index dcfcb848c..84f8c0a8e 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 The vLLM team.
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cc7c99e50..7c0e8c214 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import dataclasses
 import json
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 739ea06ae..053635a28 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import copy
 import time
diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py
index 4b1842625..aa54c0693 100644
--- a/vllm/engine/async_timeout.py
+++ b/vllm/engine/async_timeout.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Workaround for https://github.com/python/cpython/issues/86296
 #
 # From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dd677300f..d82d9ad9d 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import time
 from collections import Counter as collectionsCounter
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index b771c190d..ce806b4a9 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from typing import TYPE_CHECKING
 from typing import Counter as CollectionsCounter
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 5c7a430d1..7f0c2fa70 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 These types are defined in this file to avoid importing vllm.engine.metrics
 and therefore importing prometheus_client.
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index d9703b820..3cf1850ee 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 5237f63c3..85b5f31e3 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import copy
 import pickle
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 166f89743..a0dd79586 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle
 import signal
 from contextlib import contextmanager
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
index 50adaf4e5..4c8e295c1 100644
--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Callable, List
 
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 99c2baf3f..8ceef855e 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 from typing import Callable, List, cast
 
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index 55c56abea..4d96791a1 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 from vllm.config import SchedulerConfig
diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py
index 4b701f815..3bca0bee3 100644
--- a/vllm/engine/output_processor/stop_checker.py
+++ b/vllm/engine/output_processor/stop_checker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional, Tuple
 
 from vllm.lora.request import LoRARequest
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
index 770982a20..0d2b58c10 100644
--- a/vllm/engine/output_processor/util.py
+++ b/vllm/engine/output_processor/util.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 from typing import Sequence as GenericSequence
 from typing import cast
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index de7b2c1b9..d11125586 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from abc import ABC, abstractmethod
 from typing import AsyncGenerator, List, Mapping, Optional
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index daefbff7e..96818507d 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 NOTE: This API server is used only for demonstrating usage of AsyncEngine
 and simple performance benchmarks. It is not intended for production use.
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 97d2561df..3a6e75b1d 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import codecs
 import json
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index 5dcf50bd1..351a39525 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import signal
 from http import HTTPStatus
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 46b595b0d..d071a0b3c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 import warnings
 from contextlib import contextmanager
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index 584ee0d9e..e82b6ba6c 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Union
 
 from vllm.logger import init_logger
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 9e5cf4ba2..b8f54d6c7 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import atexit
 import gc
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 9cfe07c65..3054958f3 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This file contains the command line arguments for the vLLM's
 OpenAI-compatible server. It is kept in a separate file for documentation
diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py
index c8132811d..41e5eef40 100644
--- a/vllm/entrypoints/openai/logits_processors.py
+++ b/vllm/entrypoints/openai/logits_processors.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import lru_cache, partial
 from typing import Dict, FrozenSet, Iterable, List, Optional, Union
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 29d071ce5..83b841826 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
 import re
diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py
index a21bff52f..80354d69b 100644
--- a/vllm/entrypoints/openai/reasoning_parsers/__init__.py
+++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
 
diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
index e5d10ee0b..b5df7e474 100644
--- a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
+++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from functools import cached_property
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union
diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
index a440ddc8d..5c19888d4 100644
--- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
+++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import re
 from typing import Optional, Sequence, Tuple, Union
 
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 37ae23506..675d3cdcf 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from http import HTTPStatus
 from io import StringIO
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index dc97f0eb0..107220d54 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import json
 import time
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 13c392636..e7ad263e7 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import time
 from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index e7116a3d9..45f8ad90d 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import base64
 import time
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 8d54164e5..8d39fdcb7 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 from concurrent.futures.thread import ThreadPoolExecutor
 from http import HTTPStatus
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 22e74b387..f917a4851 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import pathlib
 from dataclasses import dataclass
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 583032207..01a3d211f 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import base64
 import time
diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py
index be4420261..366df7121 100644
--- a/vllm/entrypoints/openai/serving_rerank.py
+++ b/vllm/entrypoints/openai/serving_rerank.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast
 
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 381edf8fa..832aa8516 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import time
 from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index b67ecfb01..6c79adf90 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Final, List, Optional, Union
 
 from fastapi import Request
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 2850349a4..d1c3afa64 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .abstract_tool_parser import ToolParser, ToolParserManager
 from .granite_20b_fc_tool_parser import Granite20bFCToolParser
 from .granite_tool_parser import GraniteToolParser
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index aa7c20109..7cdd6d4c4 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from functools import cached_property
 from typing import Callable, Dict, List, Optional, Sequence, Type, Union
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index 93e357e8b..002bf1738 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import re
 from json import JSONDecoder
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index 8aefcd8d5..c948ed78f 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 from typing import Dict, Sequence, Union
 
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index 869d15ac3..4841b2870 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import re
 from typing import Dict, List, Sequence, Union
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index cb391e11b..b9215e797 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 from typing import Dict, Sequence, Union
 
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index cfd024853..7c4d63e18 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import re
 from typing import Dict, List, Sequence, Union
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 1856308b8..6a7b11362 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import re
 from json import JSONDecoder
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index bada805dd..51354f7c9 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import re
 from random import choices
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 26da4d689..5c282b5c2 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import ast
 import json
 import re
diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py
index 5e4eb23bf..945cbd683 100644
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 from json import JSONDecodeError, JSONDecoder
 from typing import Any, List, Tuple
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index e8a78d216..9af37871d 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import functools
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 25098070b..78ee3047b 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import tempfile
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 471d1bfac..fb76276bb 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from abc import ABC, abstractmethod
 from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index 78c86321d..d1f8c36fb 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 from typing import Any, Callable, List, Optional, Union
diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py
index c467115f1..e680d53cb 100644
--- a/vllm/executor/msgspec_utils.py
+++ b/vllm/executor/msgspec_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from array import array
 from typing import Any, Type
 
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 539b6ae2d..cef6a994a 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 import sys
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 2afd99f99..80e7a1c40 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 from collections import defaultdict
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index e55155ea0..5d5cc8398 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import time
 from collections import defaultdict
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index a5c4dcf0e..dcb4a8f27 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 828b394ec..10de8bc59 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from collections import defaultdict
 from contextlib import contextmanager
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index a0dd89f69..6f8f2cd75 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .data import (DecoderOnlyInputs, EncoderDecoderInputs,
                    ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
                    SingletonInputs, SingletonInputsAdapter, SingletonPrompt,
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 57e85779d..2ffebeee3 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from functools import cached_property
 from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal,
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index 09f1ff2cb..454d9d830 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Literal, Sequence, TypedDict, Union, cast, overload
 
 from typing_extensions import TypeIs
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 70372e0ca..4d8f28cb0 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from typing import List, Mapping, Optional, Union
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 4b73ade7a..0ec726b8b 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 from collections import UserDict
 from dataclasses import dataclass
diff --git a/vllm/logger.py b/vllm/logger.py
index cac174f7b..b20d55e3c 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Logging configuration for vLLM."""
 import datetime
 import json
diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py
index 576ccf78a..7ab463258 100644
--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.logging_utils.formatter import NewLineFormatter
 
 __all__ = [
diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py
index b24b4e11d..010b0a124 100644
--- a/vllm/logging_utils/formatter.py
+++ b/vllm/logging_utils/formatter.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 
 
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
index 7716ccd27..d02072e8f 100644
--- a/vllm/logits_process.py
+++ b/vllm/logits_process.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Tuple, Union
 
 import torch
diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
index 545ec21ca..3d6620817 100644
--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # pylint: disable=unused-argument
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast
 
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index cdd439d03..9f0297596 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # pylint: disable=unused-argument
 import math
 from dataclasses import dataclass
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index 93ad4651f..00299bf6c 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional
 from typing import Sequence as GenericSequence
 
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 2e04cb902..ef77fd4b7 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import math
 import os
diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py
index 9c9159b95..85601d58c 100644
--- a/vllm/lora/ops/torch_ops/__init__.py
+++ b/vllm/lora/ops/torch_ops/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand  # noqa: F401
 from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
                                               sgmv_expand, sgmv_expand_slice,
diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py
index 5f5aafd51..af79f9841 100644
--- a/vllm/lora/ops/torch_ops/lora_ops.py
+++ b/vllm/lora/ops/torch_ops/lora_ops.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py
index 9805b6dd5..dc440f732 100644
--- a/vllm/lora/ops/triton_ops/__init__.py
+++ b/vllm/lora/ops/triton_ops/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
 from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
 from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
diff --git a/vllm/lora/ops/triton_ops/bgmv_expand.py b/vllm/lora/ops/triton_ops/bgmv_expand.py
index 42adb191b..98510b396 100644
--- a/vllm/lora/ops/triton_ops/bgmv_expand.py
+++ b/vllm/lora/ops/triton_ops/bgmv_expand.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/bgmv_expand_slice.py b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py
index f397d752a..48804123c 100644
--- a/vllm/lora/ops/triton_ops/bgmv_expand_slice.py
+++ b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/bgmv_shrink.py b/vllm/lora/ops/triton_ops/bgmv_shrink.py
index f3ef01d39..227a5765e 100644
--- a/vllm/lora/ops/triton_ops/bgmv_shrink.py
+++ b/vllm/lora/ops/triton_ops/bgmv_shrink.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py
index 48fa5cd63..a8e71cacf 100644
--- a/vllm/lora/ops/triton_ops/sgmv_expand.py
+++ b/vllm/lora/ops/triton_ops/sgmv_expand.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py
index 9bb35e8ff..8b26583c1 100644
--- a/vllm/lora/ops/triton_ops/sgmv_shrink.py
+++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 7df5bc2c2..78409b91a 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 from typing import Dict, List, Tuple
 
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index b9c506f6e..9496ab5a7 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py
 
 import json
diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py
index 48ada3926..915fc6623 100644
--- a/vllm/lora/punica_wrapper/__init__.py
+++ b/vllm/lora/punica_wrapper/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
 from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper
 
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index b9ec0c4bc..1a2282ae9 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py
index b9ae3e074..29428f4cf 100644
--- a/vllm/lora/punica_wrapper/punica_cpu.py
+++ b/vllm/lora/punica_wrapper/punica_cpu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, Optional, Tuple, Union
 
 import torch
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 451f23e49..9ccd9c36a 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py
index d9c4f44a1..51e1bfab3 100644
--- a/vllm/lora/punica_wrapper/punica_hpu.py
+++ b/vllm/lora/punica_wrapper/punica_hpu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple, Union, final
 
 import torch
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index a29322465..ad5d4b788 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import resolve_obj_by_qualname
diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py
index 7360c8c09..dbc2d27c5 100644
--- a/vllm/lora/punica_wrapper/utils.py
+++ b/vllm/lora/punica_wrapper/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 import torch
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index 5e3d2f0ed..badfaa419 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import warnings
 from typing import Optional
 
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index d72b7638d..f47b0af15 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import re
 from typing import List, Optional, Set, Tuple, Type, Union
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index a64296f7f..f33a7b88c 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from contextlib import contextmanager
 from typing import Any, Dict, List, Literal, Optional, Set, Type, Union
 
diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
index 7278c7fbe..763615217 100644
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            PackedvLLMParameter)
 from vllm.model_executor.sampling_metadata import (SamplingMetadata,
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 96995c56b..ee4f41ea6 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, Type
 
 import torch.nn as nn
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index 18b435a42..cf96461a5 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py
index 8deb4c949..db4ce2680 100644
--- a/vllm/model_executor/guided_decoding/guided_fields.py
+++ b/vllm/model_executor/guided_decoding/guided_fields.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Dict, List, Optional, TypedDict, Union
 
diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
index a17e75a80..7eaf9e38e 100644
--- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
+++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import lru_cache
 from json import loads as json_loads
 from typing import Optional, Union
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index eb8db8824..ba9c98290 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import concurrent.futures
 import os
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index e4eb3f16e..ab72b55a8 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024- the Outlines developers
 # This file is adapted from
 # https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py
index 90dfa62ec..87ef45358 100644
--- a/vllm/model_executor/guided_decoding/utils.py
+++ b/vllm/model_executor/guided_decoding/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import re
 
 
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index ee30ce96f..c01bd3af1 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # noqa: UP007
 from __future__ import annotations
 
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index fb9684ac1..f782920d0 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Custom activation functions."""
 import math
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index c4223d126..6f933c3fa 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from contextlib import contextmanager
 from typing import Any, Dict, Optional
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 87993267c..4ca569ca4 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Fused MoE utilities for GPTQ."""
 import functools
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index c80e6bf07..9613696a0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Fused MoE kernel."""
 import functools
 import json
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index da0ce1885..3c7ef5e00 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import abstractmethod
 from enum import Enum
 from typing import Callable, List, Optional, Tuple
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index 563ee18c6..0365afa10 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import torch.nn.functional as F
 from torch_xla.experimental.custom_kernel import _histogram
diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
index bcff55f4f..d9a5de1b3 100644
--- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
+++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import torch.nn.functional as F
 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 43ea4eb5a..b476fb0db 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Custom normalization layers."""
 from typing import Optional, Tuple, Union
 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 52263e96f..08f1e103e 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from abc import abstractmethod
 from typing import Dict, List, Optional, Tuple
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 42decde1d..ebf74c67d 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A layer that compute logits from hidden_stats."""
 import inspect
 from typing import Optional
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 606c796d5..93c3cc91b 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 from torch import nn
 from torch.nn.parameter import Parameter
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index be5639df9..21e27160f 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright (c) 2024, Tri Dao.
 # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
 
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index 1484b7981..3c35f1ac0 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/selective_state_update.py
 
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 75bf33dc7..0012636ef 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from enum import IntEnum
 from typing import List, Optional, Union
 
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index bd0fd4799..6ded3874f 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List, Type
 
 from vllm.model_executor.layers.quantization.base_config import (
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py
index 72c89fe2b..6c08d016c 100644
--- a/vllm/model_executor/layers/quantization/aqlm.py
+++ b/vllm/model_executor/layers/quantization/aqlm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Supports AQLM compression, see https://github.com/Vahe1994/AQLM
 # and https://arxiv.org/pdf/2401.06118.pdf
 
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index d83528e9e..ff77af44d 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 0c3c98168..8849ba292 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py
index ace8f4a34..09efd4dbd 100644
--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import triton
 import triton.language as tl
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 2fb2642dd..2eefcc4f3 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import inspect
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional, Type
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 5dc872933..889eda009 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 37981ed91..24f7542e1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from contextlib import suppress
 from typing import Any, Dict, List, Literal, Optional, Tuple, cast
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index e1c45f4e4..db8e8a4b6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 from enum import Enum
 from typing import Callable, List, Optional
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 569ecaa6f..b26c74f24 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .compressed_tensors_scheme import CompressedTensorsScheme
 from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
                                           CompressedTensorsW4A16Sparse24)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index 21e6fe7a2..84f924b23 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index b4bab33e1..daa25d23a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 2e1b5e3c2..535ea6b32 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 1671a23d7..5c8261908 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 1d4e4bd52..5dcc41a9e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 0e3f47317..08d86a4e5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional, Set
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 2dd243b9c..38df09ff3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional, Set
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index f4c1dbc03..b69c5e7a0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Type
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 34996b08e..d700a0b15 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import re
 from typing import Iterable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py
index 36598b3e2..b41236501 100644
--- a/vllm/model_executor/layers/quantization/deepspeedfp.py
+++ b/vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 100cbfa4c..87fbcf62a 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index 7b71e13b5..da5ef36c5 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index adab1973b..86e025310 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index f0943efa0..86e6dbb5a 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import gguf
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index abafad0f1..0cb77a754 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 from enum import Enum
 from fractions import Fraction
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 4dc4b052b..99ab29995 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Callable, Dict, List, Optional, Set, Union
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
index 07552c0f1..cec984483 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py
index 28538d299..432f43688 100644
--- a/vllm/model_executor/layers/quantization/hqq_marlin.py
+++ b/vllm/model_executor/layers/quantization/hqq_marlin.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index c16a96213..2531170ec 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
index 915bdc477..c06befaf3 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Callable, Optional, Tuple
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index 83549870e..bcfdb1677 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Type
 
 import vllm.envs as envs
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
index 1d85d62ec..2706fbb53 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
index 15df0200f..3f0586f6e 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import partial
 from typing import Optional, Tuple
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
index 6969583d6..e21801cf6 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
index c4a83b4fa..91e765405 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Optional, Tuple
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index 4824a1180..a5967995a 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import Dict, List, Optional, Type
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 2e83a0428..2bf21a05c 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
index 97ec8cb05..5da5df8ef 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Tuple
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
index 9de668e65..0bf090d7f 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import warnings
 from typing import Optional, Tuple
 
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index e1870c73c..388a4f166 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 
 from vllm.logger import init_logger
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
index 20212e672..4cf0c677c 100644
--- a/vllm/model_executor/layers/quantization/marlin.py
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index a1b3eeb43..348e9bccd 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 11a9d4ac5..1ae765a22 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py
index 2d5cdfa16..a8e8be207 100644
--- a/vllm/model_executor/layers/quantization/neuron_quant.py
+++ b/vllm/model_executor/layers/quantization/neuron_quant.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from importlib.util import find_spec
 from typing import Any, Dict, List, Optional
diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py
index 2ccd08202..6e9d3dc6c 100644
--- a/vllm/model_executor/layers/quantization/qqq.py
+++ b/vllm/model_executor/layers/quantization/qqq.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index fc214255e..0451cf82b 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import fnmatch
 import re
 from typing import Any, Dict, List, Optional, cast
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 68a395454..98743b15e 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Callable, Dict, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
index fb0ba9bd5..9069b5a0d 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .quark_scheme import QuarkScheme
 from .quark_w8a8_fp8 import QuarkW8A8Fp8
 from .quark_w8a8_int8 import QuarkW8A8Int8
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
index 239597fa4..40c8ea86d 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
index 206931ea2..c885e98a4 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
index 8cb47e9c3..1bf34b098 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, List, Optional, Set
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
index 742a629bd..afb1d9d63 100644
--- a/vllm/model_executor/layers/quantization/quark/utils.py
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import re
 from typing import Any, Iterable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py
index a26c52478..026881f2d 100644
--- a/vllm/model_executor/layers/quantization/schema.py
+++ b/vllm/model_executor/layers/quantization/schema.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 This file contains the Pydantic schemas for various quantization-related
 parameters. When a relevant quantization technique is specified, these
diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py
index 605c3a386..3234fecaa 100644
--- a/vllm/model_executor/layers/quantization/tpu_int8.py
+++ b/vllm/model_executor/layers/quantization/tpu_int8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py
index e60f0c79a..f7ee47288 100644
--- a/vllm/model_executor/layers/quantization/utils/__init__.py
+++ b/vllm/model_executor/layers/quantization/utils/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .layer_utils import replace_parameter, update_tensor_inplace
 
 __all__ = ['update_tensor_inplace', 'replace_parameter']
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 850820f66..29c7268ad 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from https://github.com/sgl-project/sglang/pull/2575
 import functools
 import json
diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py
index edce6d19b..5acae7ca3 100644
--- a/vllm/model_executor/layers/quantization/utils/layer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Union
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py
index 18e133205..cb7d49ed6 100644
--- a/vllm/model_executor/layers/quantization/utils/machete_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index c9366ca97..3beba3083 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple
 
 import numpy
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 245fe9238..6120a8e66 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index 4a06c5d63..fb557a313 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utility functions used for tests and benchmarks"""
 
 from typing import List, Optional
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
index 17d09055b..3654268e2 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utility functions used for tests and benchmarks"""
 
 import random
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
index cb58eb945..176b2947a 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import numpy
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 95e785dcc..62484f62f 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """This file is used for /tests and /benchmarks"""
 from typing import List, Optional, Tuple
 
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 3af3b3e0e..3fd88e875 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple, Union
 
 import torch
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 9d6c3797c..62e27b714 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import cached_property
 from importlib.util import find_spec
 from typing import Dict, Optional, Tuple
diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index a67713c32..4c9860006 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index d071cfe88..814c3b7d9 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index 8dc26309d..6af734be5 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
 import warnings
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py
index 6aa4b8bd3..35c7ffec2 100644
--- a/vllm/model_executor/layers/spec_decode_base_sampler.py
+++ b/vllm/model_executor/layers/spec_decode_base_sampler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import abstractmethod
 from typing import Dict, Optional, Union
 
diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py
index 584cf971d..95362c280 100644
--- a/vllm/model_executor/layers/typical_acceptance_sampler.py
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import torch.jit
 
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index f6f34cd49..dfe71028c 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utility methods for model layers."""
 from typing import Tuple
 
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index f230efaca..e409094dd 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import List, Optional, Sequence, Tuple
 
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index 12468997e..9048c70c7 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from torch import nn
 
 from vllm.config import VllmConfig
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 4be511d12..809af81d7 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa: SIM117
 import collections
 import copy
diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py
index a90fbd648..d900fb3a7 100644
--- a/vllm/model_executor/model_loader/neuron.py
+++ b/vllm/model_executor/model_loader/neuron.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utilities for selecting and loading neuron models."""
 import copy
 import importlib
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
index e6299295c..7bd531c56 100644
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa: SIM117
 from pathlib import Path
 from typing import List, Optional, Tuple
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 9266ca75d..117251ccf 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import dataclasses
 import io
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 3f923d2f6..084ca53b1 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utilities for selecting and loading models."""
 import contextlib
 from dataclasses import dataclass, field
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index e4d103f7c..cade0a1dd 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utilities for downloading and initializing model weights."""
 import fnmatch
 import glob
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index a3ef9adad..6be4a8341 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
                          SupportsPP, has_inner_state, supports_lora,
                          supports_multimodal, supports_pp)
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 55e90b9d4..3e1daa773 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any, Optional, TypeVar
 
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index fd6b5659d..d015682aa 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Inference-only Snowflake Arctic model."""
 from typing import Iterable, List, Optional, Set, Tuple, Union
 
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 8c6873de1..97502c38b 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
                     Union)
 
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index a923ed36a..5dfaa727b 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 57eb5adc8..204c48d0d 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Derived from BART implementation posted on HuggingFace; license below:
 #
 # coding=utf-8
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 4be136543..4d0f5ac8e 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable, List, Optional, Set, Tuple
 
 import torch
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 987dfaf44..bedbdceb7 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Minimal implementation of BlipVisionModel intended to be only used 
 within a vision language model."""
 from typing import Iterable, Optional, Set, Tuple, Union
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index b559ac677..2b0452222 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index fee74f491..229677ae7 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index e834c9004..9061a3128 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import cached_property
 from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
                     Tuple, TypedDict, Union)
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index d5f9b4d19..b81a9e917 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/THUDM/CogAgent
 """Inference-only CogAgent model compatible with THUDM weights."""
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index dd69f6c9a..1e784f5b4 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Minimal implementation of CLIPVisionModel intended to be only used
 within a vision language model."""
 from typing import Iterable, List, Optional, Set, Tuple, Union
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 989056bf5..e73627da0 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index b2aa3c070..bb3f4f40d 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable, List, Optional, Set, Tuple, Union
 
 import torch
diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py
index c55185395..b239b642f 100644
--- a/vllm/model_executor/models/decilm.py
+++ b/vllm/model_executor/models/decilm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 DeciAI Research Team. All rights reserved.
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index 74b6bfdf2..9599e1df6 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 73388cd26..f5fede4d8 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index 06ea3dab9..a4829aa1a 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 344832d8b..1343b9762 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
 import math
diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index 948560b49..373a728be 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable, List, Optional, Tuple
 
 import torch
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index bc3295da7..2eb91a682 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py
 # Copyright 2024 The LG U+ CTO AI Tech Lab.
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py
index b93a68680..310aca999 100644
--- a/vllm/model_executor/models/fairseq2_llama.py
+++ b/vllm/model_executor/models/fairseq2_llama.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved.
 #
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index c503a368e..01b66a1c2 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index 3a5fe8e1f..4a1ad5f4e 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from typing import Iterable, List, Optional, Set, Tuple
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index dbf9da50c..6d8c82968 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index b23aba829..cb81aa41e 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 The vLLM team.
 # Copyright (c) Google Inc.
 #
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index f0dc76939..a6dc8f847 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py
index 942d1e14b..5f1903345 100644
--- a/vllm/model_executor/models/glm.py
+++ b/vllm/model_executor/models/glm.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Inference-only HF format GLM-4 model compatible with THUDM weights."""
 from vllm.config import VllmConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py
index 51922e6f2..4449eb8e8 100644
--- a/vllm/model_executor/models/glm4_vision_encoder.py
+++ b/vllm/model_executor/models/glm4_vision_encoder.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/THUDM/GLM-4
 """Inference-only GLM-4v model visual encoder compatible with THUDM weights."""
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 2f1aa2d68..7ad9a24dc 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index c64bc7068..887a44474 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 08298cc0d..815aba145 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 731642772..550ca3f7c 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 543b4e2f5..85911a0f4 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index cdf9414d5..8ae661bf1 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index d179d6235..7bda54ea7 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from array import array
 from typing import List, Optional, Union
 
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index df7e768fe..91c89b159 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
 # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
 # --------------------------------------------------------
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index 4e42a4b6f..f9c2175b2 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
 # Copyright 2024 The vLLM team.
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index d16a77f86..9e2e677a6 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index c5fd0d933..0fc5c4db1 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
                     Protocol, Type, Union, overload, runtime_checkable)
 
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 37b91a803..c5f7be135 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import (TYPE_CHECKING, List, Optional, Protocol, Type, Union,
                     overload, runtime_checkable)
 
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 8ad009d51..0499f339b 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
 # --------------------------------------------------------
 # InternVL
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 28c23edd4..c211ca5f4 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union
 
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index 93ac2dcf8..106c3b6b7 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple, Union
 
 import torch
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index f4b7e4478..c46a867a7 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
 # --------------------------------------------------------
 # InternVL
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 8c81dff6b..72bcef5e2 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 890b5530b..d82c08152 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Inference-only Jamba model."""
 from typing import Iterable, List, Optional, Set, Tuple
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index e7c264c04..d91c8782a 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 296af2aac..de3777cad 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index fda4f22d3..185edcb8d 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 5be85d7c0..a50025135 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 5b0f35b08..ac502000c 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 553bc9c28..5034b3345 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """PyTorch MAMBA model."""
 from typing import Iterable, List, Optional, Set, Tuple
 
diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py
index 79393421f..353177f78 100644
--- a/vllm/model_executor/models/mamba_cache.py
+++ b/vllm/model_executor/models/mamba_cache.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Dict, List
 
diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
index 66bdcb89a..a19d7da56 100644
--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable, List, Optional, Set, Tuple
 
 import torch
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 6254d26c7..29473f5bb 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index 5e1e6c6fa..878f0c895 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2024 The ModelBest team.
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index eb4282d62..f1c168076 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index bf967d33a..6964d6bdc 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index fbb3704fa..70880eb75 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 7a9b8cd88..fdc438917 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index f7f9d7a18..d1cb04cdb 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index f1d796ca2..cf4123a2c 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from typing import Iterable, List, Set, Tuple
 
diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py
index a9102a607..23814e632 100644
--- a/vllm/model_executor/models/module_mapping.py
+++ b/vllm/model_executor/models/module_mapping.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 #  https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py
 
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 5c7ae0dee..b524a1497 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 import re
 from array import array
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 123581641..676c96062 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
 import math
 from typing import Iterable, List, Optional, Set, Tuple, Union
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 2340283b6..6f0b831ac 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index df4fd0a32..2aa04bd71 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
 # --------------------------------------------------------
 # NVLM-D
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 538e31ec9..3b470dfdd 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
 # Copyright 2024 The vLLM team.
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index a35c911f9..4b0455098 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py
 # Copyright 2024 The vLLM team.
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index fbe5d1aee..d6e24c6d6 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index ea1185aa8..ad1d66902 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index a3757b5c8..f4f5cdff6 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
 # Copyright (c) OrionStar Inc.
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 5a28b1ffb..65d810dc2 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index 14dd4b5b1..6a80bea34 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 59b7508a3..6b05bfee9 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
index 34141511e..8f84e0726 100644
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from llama.py
 """Inference-only Phi3 model code inherit from Llama.py"""
 
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index f47676b93..a8b7e9b2a 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from typing import Iterable, List, Optional, Set, Tuple, Union
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 0fcda81da..f089fa5d2 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 6367b770a..aa4bb52c4 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 37b9989e4..003e9c84c 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from dataclasses import dataclass, fields
 from functools import cached_property
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 86a9d3089..d7f6662bc 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
 # Copyright (c) Alibaba Cloud.
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 82de1c357..e3de6b64f 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
 # Copyright 2024 The Qwen team.
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index fc5aed5c9..cf104ab00 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 95de6c218..35d9854a5 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
 # Copyright 2024 The Qwen team.
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 593ce4857..00e4159e2 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
 # Copyright 2024 The Qwen team.
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index a2778ee73..189ac41e8 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
 # Copyright 2024 The Qwen team.
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index de05bf2b7..40bbc7d16 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Whenever you add an architecture to this page, please also update
 `tests/models/registry.py` with example HuggingFace models for it.
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 5997a7689..742e63a06 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from typing import Iterable, List, Optional, Tuple
 
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 1e5101897..a81462f6f 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Implementation of SiglipVisionModel intended to be only used
 within a vision language model."""
 
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index e6d919f23..6215ed814 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index c9d1af782..a5d443266 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team.
 # All rights reserved.
 #
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 1cd0dedfe..01ea43666 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index 02ca7fe08..a38035e37 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 605a0ecf4..5e86b15db 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
 import math
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 01a232fdc..fff4be34d 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from dataclasses import dataclass, field
 from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional,
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 57166f05c..0d67ee7bb 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Final, Generic, Optional, Protocol, TypeVar, Union
 
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 15e35fa9c..2319c3160 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
                     Union)
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index a9ce8af15..2b1294bf7 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from fractions import Fraction
 from typing import Callable, Optional, Union
 
diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py
index b86cafce8..dea8b0e9d 100644
--- a/vllm/model_executor/pooling_metadata.py
+++ b/vllm/model_executor/pooling_metadata.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Any, Dict, List, Tuple
 
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 61e8881b6..0a580a4e9 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from array import array
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 6f1cc9d5e..04f922dfd 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Utils for model executor."""
 from typing import Any, Dict, Optional
 
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 1d7f5d57f..741bd1a6a 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .base import MultiModalPlaceholderMap, MultiModalPlugin
 from .hasher import MultiModalHashDict, MultiModalHasher
 from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index de80f22ba..f379ec168 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import base64
 from io import BytesIO
 from pathlib import Path
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index fd3ec7e0e..c48d07ba3 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from pathlib import Path
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index 24aa1ca65..7d277fd67 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle
 from typing import TYPE_CHECKING, Iterable, Mapping, Optional
 
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index da13a381c..98ac8057e 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import base64
 from functools import lru_cache
 from io import BytesIO
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index b35184f68..eb52551bb 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index ccff0e857..063f458b2 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from collections import UserDict
 from collections.abc import Callable, Iterator, Mapping, Sequence
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 750646ac6..2ad42d1c1 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import re
 from abc import ABC, abstractmethod
 from collections import defaultdict
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index c68edaff8..953c01000 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 7a4b85385..29036691b 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 from collections import UserDict
 from dataclasses import dataclass
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 900bed592..583f53655 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import lru_cache
 from itertools import groupby
 from pathlib import Path
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 1ad1f5abc..88f184399 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import base64
 from functools import lru_cache, partial
 from io import BytesIO
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 25b226528..786380c37 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from dataclasses import dataclass
 from typing import Dict, Generic, List, MutableSequence, Optional
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index ddbdc43ca..d34b660df 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import traceback
 from itertools import chain
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 159ea94f9..4e0683b8a 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import TYPE_CHECKING, Optional
 
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 91dcdff00..44d2506f0 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Code inside this file can safely assume cuda platform, e.g. importing
 pynvml. However, it should not initialize cuda context.
 """
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 0e1c4c0c5..78ddb67bb 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import TYPE_CHECKING, Optional
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 186fa54bf..dc6545c93 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 import platform
 import random
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 23a7126fb..5a03f5f7a 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, Optional
 
 from vllm.logger import init_logger
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 3282c0617..41221de0a 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, Optional
 
 import torch
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 888852163..cd851c0d8 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from functools import lru_cache
 from typing import TYPE_CHECKING, Dict, List, Optional
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 494a17633..fffc61bba 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, Optional
 
 import torch
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 039cdd5ad..81bc85f94 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, Optional
 
 import torch
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index a78a05491..389cb8728 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import os
 from typing import Callable, Dict
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index b24b7e91a..061232eb1 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Optional
 
 import msgspec
diff --git a/vllm/profiler/__init__.py b/vllm/profiler/__init__.py
index 3e25f5cc2..00af72b1d 100644
--- a/vllm/profiler/__init__.py
+++ b/vllm/profiler/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .layerwise_profile import layerwise_profile
 
 __all__ = [
diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py
index 29c0edd0e..6351ef63d 100644
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field
diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py
index 033035e43..62b39f510 100644
--- a/vllm/profiler/utils.py
+++ b/vllm/profiler/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Callable, Dict, List, Type, Union
 
diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py
index 27a61e692..c2f9f1691 100644
--- a/vllm/prompt_adapter/layers.py
+++ b/vllm/prompt_adapter/layers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Optional
 
diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py
index 18a5f86c3..3ba7d0896 100644
--- a/vllm/prompt_adapter/models.py
+++ b/vllm/prompt_adapter/models.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import math
 from typing import Any, Callable, Dict, List, Optional, Type
diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py
index 775dd11db..dfb8e61d7 100644
--- a/vllm/prompt_adapter/request.py
+++ b/vllm/prompt_adapter/request.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import msgspec
 
 from vllm.adapter_commons.request import AdapterRequest
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 8b2732923..dd179ab93 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
 
 import os
diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py
index ddc1ef893..28dcc1687 100644
--- a/vllm/prompt_adapter/worker_manager.py
+++ b/vllm/prompt_adapter/worker_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 from typing import Any, Optional, Set, Type
 
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 605c09b8d..97f9e2129 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Sampling parameters for text generation."""
 import copy
 from dataclasses import dataclass
diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py
index 20063a5b4..9f6e85920 100644
--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 import struct
 from dataclasses import dataclass
diff --git a/vllm/scripts.py b/vllm/scripts.py
index 8101e6b3a..467cab28f 100644
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # The CLI entrypoint to vLLM.
 import argparse
 import os
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 74320db70..534b9e606 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Sequence and its related classes."""
 import copy
 import enum
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index 56fb9ba50..e08ed742a 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from array import array
 from itertools import chain, count
 from typing import Iterator, List, Optional, Tuple
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index fe5fd39f4..3948298db 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional
 
 import torch
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index c39e98b6c..dd085ad77 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import List, Optional, Set, Union
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
index 21a58fc42..0b62a988e 100644
--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import weakref
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index d678f4578..bc0e0a121 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from typing import Callable, Optional, Union
 
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py
index fc41bb82e..bdaf31895 100644
--- a/vllm/spec_decode/mlp_speculator_worker.py
+++ b/vllm/spec_decode/mlp_speculator_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Set, Tuple
 
 import torch
diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py
index 3aea2eabb..6275c460e 100644
--- a/vllm/spec_decode/mqa_scorer.py
+++ b/vllm/spec_decode/mqa_scorer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.sequence import (ExecuteModelRequest, SequenceData,
                            SequenceGroupMetadata, get_all_seq_ids)
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 32197f8cc..5474917a6 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import weakref
 from typing import Dict, List, Set, Tuple
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index e906b1789..86390c99c 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import weakref
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py
index 28a537593..2bebf80fa 100644
--- a/vllm/spec_decode/proposer_worker_base.py
+++ b/vllm/spec_decode/proposer_worker_base.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py
index c6ff5e52f..a1466ba5d 100644
--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Set, Tuple
 
 import torch
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 8d6d05cba..8653bece8 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 from collections import defaultdict
 from functools import cached_property
diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py
index 56540744b..08e773c56 100644
--- a/vllm/spec_decode/target_model_runner.py
+++ b/vllm/spec_decode/target_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional
 
 from vllm.sequence import SequenceGroupMetadata
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index 6bf7587cd..b538923c0 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Set, Tuple
 
 import torch
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index c88820ab2..9c04680a6 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from contextlib import contextmanager
 from typing import Dict, List, Optional, Sequence, Tuple
diff --git a/vllm/tracing.py b/vllm/tracing.py
index 72a3f8511..bf069ad84 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import Mapping, Optional
 
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py
index eeec029fc..01d5bb4b5 100644
--- a/vllm/transformers_utils/__init__.py
+++ b/vllm/transformers_utils/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.envs import VLLM_USE_MODELSCOPE
 
 if VLLM_USE_MODELSCOPE:
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 5805f4ad0..1c0f20a6e 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 import json
 import os
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index f065c5612..c484a755a 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.cohere2 import Cohere2Config
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py
index 7780bf5e7..6625ccf0f 100644
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py
index e563bf626..43e9503ff 100644
--- a/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 from transformers import PretrainedConfig
diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py
index 1509330fc..e30409b3a 100644
--- a/vllm/transformers_utils/configs/cohere2.py
+++ b/vllm/transformers_utils/configs/cohere2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # ruff: noqa
 
 # Adapted from
diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py
index 0dc966472..8f40b2b7d 100644
--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index 681528c3c..24d4052d8 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
 from typing import Tuple
 
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index b357a785e..b26aba666 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import Optional, Union
 
diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py
index f60a59f55..39364367e 100644
--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copied from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
 # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py
index c82cc6065..f161a06f3 100644
--- a/vllm/transformers_utils/configs/falcon.py
+++ b/vllm/transformers_utils/configs/falcon.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py
index b94c5b77e..48b5d79ff 100644
--- a/vllm/transformers_utils/configs/h2ovl.py
+++ b/vllm/transformers_utils/configs/h2ovl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
 # --------------------------------------------------------
diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py
index ac2492317..8ea62546e 100644
--- a/vllm/transformers_utils/configs/internvl.py
+++ b/vllm/transformers_utils/configs/internvl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
 # --------------------------------------------------------
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index 82f129eb2..0cab2c42e 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 # Copyright 2023 Cerebras Systems.
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py
index d71a08343..885713c5d 100644
--- a/vllm/transformers_utils/configs/medusa.py
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import Optional, Union
 
diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py
index 49e766d7f..eb77e09ad 100644
--- a/vllm/transformers_utils/configs/mllama.py
+++ b/vllm/transformers_utils/configs/mllama.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from transformers.models.mllama import configuration_mllama as mllama_hf_config
 
 
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
index 946af4e91..c761f659e 100644
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional
 
 from transformers import PretrainedConfig
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py
index 0f047c8b0..96356135f 100644
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copied from
 # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
 """A HuggingFace-style model configuration."""
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index 1edf36329..fdf4fa2a5 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 HuggingFace Inc. team. All rights reserved.
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 #
diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py
index 8007176ae..300f6e211 100644
--- a/vllm/transformers_utils/configs/nvlm_d.py
+++ b/vllm/transformers_utils/configs/nvlm_d.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from
 # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
 # --------------------------------------------------------
diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py
index 0e6d8e487..c6e446333 100644
--- a/vllm/transformers_utils/configs/olmo2.py
+++ b/vllm/transformers_utils/configs/olmo2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py
index 0c1c048f6..0d5db896b 100644
--- a/vllm/transformers_utils/configs/solar.py
+++ b/vllm/transformers_utils/configs/solar.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py
index eb6f5a059..5da6c5b44 100644
--- a/vllm/transformers_utils/configs/telechat2.py
+++ b/vllm/transformers_utils/configs/telechat2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py
 """ Telechat configuration compatible with LlamaConfig. """
 
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index f724bf7f2..99715ba6d 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
 from typing import Any, Dict, Optional
 
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 7c8423d2b..9d1d4bb92 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List, Optional
 
 from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py
index 37ff8a236..8160a35ff 100644
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional, Tuple
 
 from .tokenizer import AnyTokenizer
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index b12cc83a2..3197b07d8 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from functools import lru_cache
 from typing import Any, cast
 
diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py
index 9c71b8cad..4696f0c49 100644
--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.transformers_utils.processors.deepseek_vl2 import (
     DeepseekVLV2Processor)
 
diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py
index 27cdf6bc2..d37381ea9 100644
--- a/vllm/transformers_utils/processors/deepseek_vl2.py
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index 74a56cbf5..4fe744d28 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import fnmatch
 import os
 import shutil
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 1f1d67fab..520870b56 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import os
 import warnings
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index 09569c564..c223768b1 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional, Type
 
 from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
index e6cc7cd4e..fbdfa3e57 100644
--- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import List, Optional
 
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index 3f7627e11..30cab752c 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 from typing import List, Optional
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
index 6dc2f9056..025971cb7 100644
--- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional
 
 from vllm.config import TokenizerPoolConfig
diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py
index e68ad79b2..2b64f3fc7 100644
--- a/vllm/transformers_utils/tokenizers/__init__.py
+++ b/vllm/transformers_utils/tokenizers/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from .mistral import MistralTokenizer, maybe_serialize_tool_calls
 
 __all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"]
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index d801cf4e4..cecafcc78 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import re
 from dataclasses import dataclass
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 10a09fb4f..71fe3ef0b 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from os import PathLike
 from pathlib import Path
 from typing import Union
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 568185383..c8f7a32ce 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.triton_utils.importing import HAS_TRITON
 
 __all__ = ["HAS_TRITON"]
diff --git a/vllm/triton_utils/custom_cache_manager.py b/vllm/triton_utils/custom_cache_manager.py
index 17039d7ba..4163969c9 100644
--- a/vllm/triton_utils/custom_cache_manager.py
+++ b/vllm/triton_utils/custom_cache_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 
 from triton.runtime.cache import (FileCacheManager, default_cache_dir,
diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index 0c96e0632..a20700248 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from importlib.util import find_spec
 
 from vllm.logger import init_logger
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 7f5cc9063..fbbb21c89 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import datetime
 import json
 import logging
diff --git a/vllm/utils.py b/vllm/utils.py
index 15481fb06..3089f0951 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import asyncio
 import concurrent
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index ce83b1fac..837d7faf4 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Attention layer with FlashAttention."""
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 9d570b334..651bc01aa 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, Dict, List, Set, Tuple
 
 from vllm.logger import init_logger
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 7176ec954..94086e4a1 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import defaultdict
 from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple
 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 2b6557ad3..c801ab9e4 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """KV-Cache Utilities."""
 from collections.abc import Sequence
 from dataclasses import dataclass
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 27c9ac1ae..f4738bb33 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import deque
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index abe4952c4..912b92862 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List, Optional, Union
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b9dc3561d..3c4e35e4a 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 from typing import AsyncGenerator, List, Mapping, Optional, Type, Union
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index f50303bda..29a9ac186 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle
 import queue
 import signal
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index f3b992d68..247380ef7 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import os
 import signal
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 4a8b61bee..6d800f026 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import List, Optional, Union
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 55d314ebe..e0452bcad 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List, Mapping, Optional, Type, Union
 
 from typing_extensions import TypeVar
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index d83460a40..83a0d9db1 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List, Optional
 
 from vllm.config import ModelConfig
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 234ef8194..aeefd5239 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 from dataclasses import dataclass
 from typing import Dict, List, Optional
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 6196c1105..366287951 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from typing import Mapping, Optional, Union
 
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 131be7598..ac10d43eb 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Type
 
 from vllm.config import VllmConfig
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index f6cf35da0..e3f07172d 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import pickle
 import signal
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 6d5cc32ff..eddfb5949 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Dict, List
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index f901822c7..f736e38f1 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from abc import ABC, abstractmethod
 from typing import List
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 527750512..88f2c0835 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index 32aee44e3..6e82bffd7 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Dict, List, Optional
 
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 80160c673..0519d9e78 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 from typing import TYPE_CHECKING, List, Optional, Union
 
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index d60f7eb5d..8e54de345 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Set
 
diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py
index 2796d0494..ba368b44a 100644
--- a/vllm/v1/sample/ops/penalties.py
+++ b/vllm/v1/sample/ops/penalties.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Set, Tuple
 
 import torch
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index f2007d85c..27431001e 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict
 
 import torch
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 9ad665a64..3da7498e0 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A layer that samples the next tokens from the model's outputs."""
 from typing import Tuple
 
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index b1cd5c118..1791dfa2b 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle
 
 
diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py
index 902800e05..09d382638 100644
--- a/vllm/v1/stats/common.py
+++ b/vllm/v1/stats/common.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from dataclasses import dataclass
 from dataclasses import field as dataclass_field
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 8dfcf2dd7..5494542c1 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing
 import os
 import weakref
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 26a2084b1..8d0785243 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 
 import numpy as np
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 28d8e3905..39708f833 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Datastructures defining an input batch
 
 from dataclasses import dataclass
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a00c00c30..0b5644525 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import gc
 import time
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index a8cf0aec3..0adb69073 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A GPU worker class."""
 import gc
 import os
diff --git a/vllm/version.py b/vllm/version.py
index 66e189dce..70cd0289b 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 try:
     from ._version import __version__, __version_tuple__
 except Exception as e:
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index c427b759b..252fe0660 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """CacheEngine class for managing the KV cache."""
 from typing import List
 
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index fa6775cbd..71e32c5f7 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast
 
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 4b429b67b..1c3feece9 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 import weakref
 from collections import defaultdict
diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py
index d31ba89e1..c0744d63b 100644
--- a/vllm/worker/cpu_pooling_model_runner.py
+++ b/vllm/worker/cpu_pooling_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 3e5fcf11b..27b1a2dd1 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A CPU worker class."""
 from typing import Dict, List, Optional, Set, Tuple, Type
 
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 8a161b740..e2d338f75 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 import itertools
 from typing import Any, Dict, List, Optional, Tuple, Type, cast
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index a339c97a8..b846d4387 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
 ###############################################################################
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index aaf9cb40b..a1f31bead 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
 ###############################################################################
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 322d91d62..90f08b1df 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 import gc
 import inspect
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index aef4bdcdd..9e33ef9f1 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 import pickle
 from abc import ABC, abstractmethod
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 4aab09c80..90771e8ac 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 import functools
 from dataclasses import dataclass, field
diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py
index e654f7172..387119998 100644
--- a/vllm/worker/multi_step_tpu_worker.py
+++ b/vllm/worker/multi_step_tpu_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Dict, Optional, Tuple
 
diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py
index 1f982fe10..3518ab2f6 100644
--- a/vllm/worker/multi_step_worker.py
+++ b/vllm/worker/multi_step_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 596c26eac..f2093fc42 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from dataclasses import dataclass
 from importlib.util import find_spec
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index e02c72faa..5f0eb0019 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A Neuron worker class."""
 from typing import List, Optional, Tuple
 
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 42fe2cf66..44442cddb 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import defaultdict
 from typing import Dict, List, NamedTuple, Optional, Tuple
 
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index f5b46cde3..0690222d9 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """An OpenVINO worker class."""
 from typing import Any, Dict, List, Optional, Tuple
 
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index 6de227f3c..f43085b0e 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 874951828..ecdf7aa88 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import enum
 import time
 from dataclasses import dataclass
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index ea0e70054..12f10169f 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from typing import List, Optional, Tuple, Union
 
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index ffa8c4cb0..d925f0883 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 '''
 Worker-related helper functions.
 '''
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 1d2884d3d..582aa460e 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A GPU worker class."""
 import gc
 import os
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 6eeb4aa17..819b81fbf 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 import os
 import time
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index b7b7b7227..9c726e1a1 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 import time
 import weakref
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index e9cb623c8..047c0bbbc 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """A XPU worker class."""
 import gc
 import os
-- 
GitLab


From e64330910b4e503e1a672a73c2bfbfc9b7305b86 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 3 Feb 2025 09:32:18 +0800
Subject: [PATCH 09/65] [doc][misc] clarify VLLM_HOST_IP for multi-node
 inference (#12667)

As more and more people are trying deepseek models with multi-node
inference, https://github.com/vllm-project/vllm/issues/7815 becomes more
frequent. Let's give clear message to users.

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/serving/distributed_serving.md | 12 +++++++++---
 vllm/executor/ray_utils.py                 |  5 ++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index 3f9ca27eb..6d136147c 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -60,7 +60,8 @@ bash run_cluster.sh \
                 vllm/vllm-openai \
                 ip_of_head_node \
                 --head \
-                /path/to/the/huggingface/home/in/this/node
+                /path/to/the/huggingface/home/in/this/node \
+                -e VLLM_HOST_IP=ip_of_this_node
 ```
 
 On the rest of the worker nodes, run the following command:
@@ -70,10 +71,11 @@ bash run_cluster.sh \
                 vllm/vllm-openai \
                 ip_of_head_node \
                 --worker \
-                /path/to/the/huggingface/home/in/this/node
+                /path/to/the/huggingface/home/in/this/node \
+                -e VLLM_HOST_IP=ip_of_this_node
 ```
 
-Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
+Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses.
 
 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
 
@@ -103,3 +105,7 @@ Please make sure you downloaded the model to all the nodes (with the same path),
 
 When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
 :::
+
+:::{warning}
+If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` to see the IP address used by Ray. See <gh-issue:7815> for more information.
+:::
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 5d5cc8398..7b3015597 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -214,7 +214,10 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
         logger.info(
             "Waiting for creating a placement group of specs for "
             "%d seconds. specs=%s. Check "
-            "`ray status` to see if you have enough resources.",
+            "`ray status` to see if you have enough resources,"
+            " and make sure the IP addresses used by ray cluster"
+            " are the same as VLLM_HOST_IP environment variable"
+            " specified in each node if you are running on a multi-node.",
             int(time.time() - s), placement_group_specs)
 
     try:
-- 
GitLab


From 326fcc8b9f1d75a9f194581c98a7994d220c5255 Mon Sep 17 00:00:00 2001
From: Zhuohan Li <zhuohan123@gmail.com>
Date: Sun, 2 Feb 2025 19:19:56 -0800
Subject: [PATCH 10/65] [Doc] Deprecate Discord (#12668)

---
 CODE_OF_CONDUCT.md | 2 +-
 README.md          | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index f801b5f8f..1a9596841 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.
 
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+channel in the [vLLM Slack](https://slack.vllm.ai).
 All complaints will be reviewed and investigated promptly and fairly.
 
 All community leaders are obligated to respect the privacy and security of the
diff --git a/README.md b/README.md
index 80c3ba7d1..09c2c6d35 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
 ---
@@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us
 
 * For technical questions and feature requests, please use Github issues or discussions.
-* For discussing with fellow users, please use Discord.
-* For coordinating contributions and development, please use Slack.
+* For discussing with fellow users and coordinating contributions and development, please use Slack.
 * For security disclosures, please use Github's security advisory feature.
 * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
 
-- 
GitLab


From 95460fc51318702a33226b87152afe810187e01e Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.utah@gmail.com>
Date: Sun, 2 Feb 2025 21:09:50 -0800
Subject: [PATCH 11/65] [Kernel] port sgl moe_align_block_size kernels (#12574)

sgl_moe_align_block_size is based on:


https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a

moe_align_block_size is based on:


https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0

Signed-off-by: Yang Chen <yangche@fb.com>
---
 csrc/moe/moe_align_sum_kernels.cu             |  92 ++++++++++
 csrc/moe/moe_ops.h                            |   6 +
 csrc/moe/torch_bindings.cpp                   |   9 +
 vllm/_custom_ops.py                           |   9 +
 vllm/envs.py                                  |   9 +-
 .../layers/fused_moe/fused_moe.py             | 162 +++++++++++++++++-
 6 files changed, 284 insertions(+), 3 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 8b6fe72ad..ff74a42d7 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -197,6 +197,72 @@ __global__ void moe_align_block_size_global_mem_kernel(
   }
 }
 
+// taken from
+// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
+template <typename scalar_t>
+__global__ void sgl_moe_align_block_size_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
+    int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
+    int32_t block_size, size_t numel, int32_t* cumsum) {
+  __shared__ int32_t shared_counts[32][8];
+  __shared__ int32_t local_offsets[256];
+
+  const int warp_id = threadIdx.x / WARP_SIZE;
+  const int lane_id = threadIdx.x % WARP_SIZE;
+  const int experts_per_warp = 8;
+  const int my_expert_start = warp_id * experts_per_warp;
+
+  for (int i = 0; i < experts_per_warp; ++i) {
+    if (my_expert_start + i < num_experts) {
+      shared_counts[warp_id][i] = 0;
+    }
+  }
+
+  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
+  const size_t start_idx = threadIdx.x * tokens_per_thread;
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int expert_id = topk_ids[i];
+    int warp_idx = expert_id / experts_per_warp;
+    int expert_offset = expert_id % experts_per_warp;
+    atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      int expert_count = 0;
+      int warp_idx = (i - 1) / experts_per_warp;
+      int expert_offset = (i - 1) % experts_per_warp;
+      expert_count = shared_counts[warp_idx][expert_offset];
+
+      cumsum[i] =
+          cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
+    }
+    *total_tokens_post_pad = cumsum[num_experts];
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+         i += block_size) {
+      expert_ids[i / block_size] = threadIdx.x;
+    }
+    local_offsets[threadIdx.x] = cumsum[threadIdx.x];
+  }
+
+  __syncthreads();
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int32_t expert_id = topk_ids[i];
+    int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1);
+    sorted_token_ids[rank_post_pad] = i;
+  }
+}
+
 template <typename scalar_t, int TOPK>
 __global__ void moe_sum_kernel(
     scalar_t* __restrict__ out,          // [..., d]
@@ -305,6 +371,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   }
 }
 
+void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
+                              int64_t block_size,
+                              torch::Tensor sorted_token_ids,
+                              torch::Tensor experts_ids,
+                              torch::Tensor num_tokens_post_pad) {
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_INTEGRAL_TYPES(
+      topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
+        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
+        // tensors
+        auto options_int =
+            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+        // torch::Tensor token_cnts_buffer =
+        //     torch::empty({(num_experts + 1) * num_experts}, options_int);
+        torch::Tensor cumsum_buffer =
+            torch::empty({num_experts + 1}, options_int);
+
+        auto kernel = vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
+        kernel<<<1, 1024, 0, stream>>>(
+            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+            experts_ids.data_ptr<int32_t>(),
+            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+            topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
+      });
+}
+
 void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
              torch::Tensor& output)  // [num_tokens, hidden_size]
 {
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 596cc0aa6..66bb5f41b 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -12,3 +12,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
                           torch::Tensor num_tokens_post_pad);
+
+void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
+                              int64_t block_size,
+                              torch::Tensor sorted_token_ids,
+                              torch::Tensor experts_ids,
+                              torch::Tensor num_tokens_post_pad);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index f3a558c14..8540633dc 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -22,6 +22,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "                     Tensor! num_tokens_post_pad) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
 
+  // temporarily adapted from
+  // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
+  m.def(
+      "sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
+      "                         int block_size, Tensor! sorted_token_ids,"
+      "                         Tensor! experts_ids,"
+      "                         Tensor! num_tokens_post_pad) -> ()");
+  m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
+
 #ifndef USE_ROCM
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index ce4f75341..bdc9a6a33 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -952,6 +952,15 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                                           num_tokens_post_pad)
 
 
+def sgl_moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
+                             block_size: int, sorted_token_ids: torch.Tensor,
+                             experts_ids: torch.Tensor,
+                             num_tokens_post_pad: torch.Tensor) -> None:
+    torch.ops._moe_C.sgl_moe_align_block_size(topk_ids, num_experts,
+                                              block_size, sorted_token_ids,
+                                              experts_ids, num_tokens_post_pad)
+
+
 def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                  token_expert_indicies: torch.Tensor,
                  gating_output: float) -> None:
diff --git a/vllm/envs.py b/vllm/envs.py
index 78ee3047b..5018f6deb 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -82,6 +82,7 @@ if TYPE_CHECKING:
     VLLM_MLA_DISABLE: bool = False
     VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
     VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
+    VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
 
 
 def get_default_cache_root():
@@ -531,7 +532,13 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # matrices to match the activation type. This can lead to higher memory and
     # compute usage but better preserves the accuracy of the original model.
     "VLLM_MLA_DISABLE_REQUANTIZATION":
-    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
+
+    # If set, vLLM will use the Triton implementation of moe_align_block_size,
+    # i.e. moe_align_block_size_triton in fused_moe.py.
+    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
+                 ),
 }
 
 # end-env-vars-definition
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 9613696a0..1bed35525 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -405,6 +405,144 @@ def fused_moe_kernel(
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
 
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+@triton.jit
+def moe_align_block_size_stage1(
+    topk_ids_ptr,
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    start_idx = pid * tokens_per_thread
+
+    off_c = (pid + 1) * num_experts
+
+    for i in range(tokens_per_thread):
+        if start_idx + i < numel:
+            idx = tl.load(topk_ids_ptr + start_idx + i)
+            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
+            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
+
+
+@triton.jit
+def moe_align_block_size_stage2(
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    last_cnt = 0
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
+        last_cnt = last_cnt + token_cnt
+        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
+
+
+@triton.jit
+def moe_align_block_size_stage3(
+    total_tokens_post_pad_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+):
+    last_cumsum = 0
+    off_cnt = num_experts * num_experts
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
+        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
+        tl.store(cumsum_ptr + i, last_cumsum)
+    tl.store(total_tokens_post_pad_ptr, last_cumsum)
+
+
+@triton.jit
+def moe_align_block_size_stage4(
+    topk_ids_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_idx = tl.load(cumsum_ptr + pid)
+    end_idx = tl.load(cumsum_ptr + pid + 1)
+
+    for i in range(start_idx, end_idx, block_size):
+        tl.store(expert_ids_ptr + i // block_size, pid)
+
+    start_idx = pid * tokens_per_thread
+    off_t = pid * num_experts
+
+    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread,
+                                         numel)):
+        expert_id = tl.load(topk_ids_ptr + i)
+        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
+        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
+        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
+        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
+
+
+# Triton implementation based on:
+# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
+def moe_align_block_size_triton(
+    topk_ids: torch.Tensor,
+    num_experts: int,
+    block_size: int,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+) -> None:
+    numel = topk_ids.numel()
+    grid = (num_experts, )
+    tokens_cnts = torch.zeros((num_experts + 1, num_experts),
+                              dtype=torch.int32,
+                              device=topk_ids.device)
+    cumsum = torch.zeros((num_experts + 1, ),
+                         dtype=torch.int32,
+                         device=topk_ids.device)
+    tokens_per_thread = ceil_div(numel, num_experts)
+
+    moe_align_block_size_stage1[grid](
+        topk_ids,
+        tokens_cnts,
+        num_experts,
+        numel,
+        tokens_per_thread,
+    )
+    moe_align_block_size_stage2[grid](
+        tokens_cnts,
+        num_experts,
+    )
+    moe_align_block_size_stage3[(1, )](
+        num_tokens_post_pad,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+    )
+    moe_align_block_size_stage4[grid](
+        topk_ids,
+        sorted_token_ids,
+        expert_ids,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+        numel,
+        tokens_per_thread,
+    )
+
+
 def moe_align_block_size(
         topk_ids: torch.Tensor, block_size: int,
         num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
@@ -457,8 +595,28 @@ def moe_align_block_size(
     num_tokens_post_pad = torch.empty((1),
                                       dtype=torch.int32,
                                       device=topk_ids.device)
-    ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
-                             expert_ids, num_tokens_post_pad)
+    if num_experts >= 224:
+        if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON:
+            moe_align_block_size_triton(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            )
+        else:
+            ops.sgl_moe_align_block_size(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            )
+    else:
+        ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
+                                 expert_ids, num_tokens_post_pad)
     return sorted_ids, expert_ids, num_tokens_post_pad
 
 
-- 
GitLab


From 20579c0fae1757c9da9fc35a69960563186f3036 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 3 Feb 2025 13:40:25 +0800
Subject: [PATCH 12/65] make sure mistral_common not imported for non-mistral
 models (#12669)

When people use deepseek models, they find that they need to solve cv2
version conflict, see https://zhuanlan.zhihu.com/p/21064432691 .

I added the check, and make all imports of `cv2` lazy.

---------

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  4 +--
 ...{lazy_torch_compile.py => lazy_imports.py} | 19 ++++++++---
 vllm/multimodal/video.py                      |  3 +-
 vllm/transformers_utils/tokenizers/mistral.py | 34 ++++++++++++-------
 4 files changed, 40 insertions(+), 20 deletions(-)
 rename tests/standalone_tests/{lazy_torch_compile.py => lazy_imports.py} (56%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d5d02fdeb..05f3c3b31 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -50,9 +50,9 @@ steps:
   - tests/multimodal
   - tests/test_utils
   - tests/worker
-  - tests/standalone_tests/lazy_torch_compile.py
+  - tests/standalone_tests/lazy_imports.py
   commands:
-  - python3 standalone_tests/lazy_torch_compile.py
+  - python3 standalone_tests/lazy_imports.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
diff --git a/tests/standalone_tests/lazy_torch_compile.py b/tests/standalone_tests/lazy_imports.py
similarity index 56%
rename from tests/standalone_tests/lazy_torch_compile.py
rename to tests/standalone_tests/lazy_imports.py
index b3b580952..61e3b3879 100644
--- a/tests/standalone_tests/lazy_torch_compile.py
+++ b/tests/standalone_tests/lazy_imports.py
@@ -8,7 +8,17 @@ from contextlib import nullcontext
 
 from vllm_test_utils import BlameResult, blame
 
-module_name = "torch._inductor.async_compile"
+# List of modules that should not be imported too early.
+# Lazy import `torch._inductor.async_compile` to avoid creating
+# too many processes before we set the number of compiler threads.
+# Lazy import `cv2` to avoid bothering users who only use text models.
+# `cv2` can easily mess up the environment.
+module_names = ["torch._inductor.async_compile", "cv2"]
+
+
+def any_module_imported():
+    return any(module_name in sys.modules for module_name in module_names)
+
 
 # In CI, we only check finally if the module is imported.
 # If it is indeed imported, we can rerun the test with `use_blame=True`,
@@ -16,8 +26,7 @@ module_name = "torch._inductor.async_compile"
 # and help find the root cause.
 # We don't run it in CI by default because it is slow.
 use_blame = False
-context = blame(
-    lambda: module_name in sys.modules) if use_blame else nullcontext()
+context = blame(any_module_imported) if use_blame else nullcontext()
 with context as result:
     import vllm  # noqa
 
@@ -25,6 +34,6 @@ if use_blame:
     assert isinstance(result, BlameResult)
     print(f"the first import location is:\n{result.trace_stack}")
 
-assert module_name not in sys.modules, (
-    f"Module {module_name} is imported. To see the first"
+assert not any_module_imported(), (
+    f"Some the modules in {module_names} are imported. To see the first"
     f" import location, run the test with `use_blame=True`.")
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 88f184399..78a2918e3 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -6,7 +6,6 @@ from io import BytesIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
-import cv2
 import numpy as np
 import numpy.typing as npt
 from PIL import Image
@@ -95,6 +94,8 @@ def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
     new_height, new_width = size
     resized_frames = np.empty((num_frames, new_height, new_width, channels),
                               dtype=frames.dtype)
+    # lazy import cv2 to avoid bothering users who only use text models
+    import cv2
     for i, frame in enumerate(frames):
         resized_frame = cv2.resize(frame, (new_width, new_height))
         resized_frames[i] = resized_frame
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index cecafcc78..1550f978e 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -8,21 +8,18 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast
 
 import huggingface_hub
 from huggingface_hub import HfApi, hf_hub_download
-from mistral_common.protocol.instruct.request import ChatCompletionRequest
-from mistral_common.tokens.tokenizers.base import SpecialTokens
-# yapf: disable
-from mistral_common.tokens.tokenizers.mistral import (
-    MistralTokenizer as PublicMistralTokenizer)
-# yapf: enable
-from mistral_common.tokens.tokenizers.sentencepiece import (
-    SentencePieceTokenizer)
-from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
-                                                     Tekkenizer)
 
 from vllm.logger import init_logger
 from vllm.utils import is_list_of
 
 if TYPE_CHECKING:
+    # make sure `mistral_common` is lazy imported,
+    # so that users who only use non-mistral models
+    # will not be bothered by the dependency.
+    from mistral_common.protocol.instruct.request import ChatCompletionRequest
+    from mistral_common.tokens.tokenizers.mistral import (
+        MistralTokenizer as PublicMistralTokenizer)
+
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
 logger = init_logger(__name__)
@@ -33,7 +30,7 @@ class Encoding:
     input_ids: Union[List[int], List[List[int]]]
 
 
-def maybe_serialize_tool_calls(request: ChatCompletionRequest):
+def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
     # SEE: https://github.com/vllm-project/vllm/pull/9951
     # Credits go to: @gcalmettes
     # NOTE: There is currently a bug in pydantic where attributes
@@ -108,12 +105,16 @@ def find_tokenizer_file(files: List[str]):
 
 class MistralTokenizer:
 
-    def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
+    def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
         self.mistral = tokenizer
         self.instruct = tokenizer.instruct_tokenizer
 
         tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
+        from mistral_common.tokens.tokenizers.tekken import (
+            SpecialTokenPolicy, Tekkenizer)
         self.is_tekken = isinstance(tokenizer_, Tekkenizer)
+        from mistral_common.tokens.tokenizers.sentencepiece import (
+            SentencePieceTokenizer)
         self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
         if self.is_tekken:
             # Make sure special tokens will not raise
@@ -153,6 +154,8 @@ class MistralTokenizer:
             assert Path(
                 path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
 
+        from mistral_common.tokens.tokenizers.mistral import (
+            MistralTokenizer as PublicMistralTokenizer)
         mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
         return cls(mistral_tokenizer)
 
@@ -181,6 +184,8 @@ class MistralTokenizer:
     # by the guided structured output backends.
     @property
     def all_special_tokens_extended(self) -> List[str]:
+        from mistral_common.tokens.tokenizers.base import SpecialTokens
+
         # tekken defines its own extended special tokens list
         if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
             special_tokens = self.tokenizer.SPECIAL_TOKENS
@@ -284,6 +289,8 @@ class MistralTokenizer:
         if last_message["role"] == "assistant":
             last_message["prefix"] = True
 
+        from mistral_common.protocol.instruct.request import (
+            ChatCompletionRequest)
         request = ChatCompletionRequest(messages=messages,
                                         tools=tools)  # type: ignore[type-var]
         encoded = self.mistral.encode_chat_completion(request)
@@ -292,6 +299,7 @@ class MistralTokenizer:
         return encoded.tokens
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        from mistral_common.tokens.tokenizers.base import SpecialTokens
         if self.is_tekken:
             tokens = [
                 t for t in tokens
@@ -363,6 +371,8 @@ class MistralTokenizer:
         ids: List[int],
         skip_special_tokens: bool = True,
     ) -> List[str]:
+        from mistral_common.tokens.tokenizers.base import SpecialTokens
+
         # TODO(Patrick) - potentially allow special tokens to not be skipped
         assert (
             skip_special_tokens
-- 
GitLab


From c5932e5daceba8a5eefece98f1a769ddf84a864b Mon Sep 17 00:00:00 2001
From: Eldar Kurtic <eldarkurtic314@gmail.com>
Date: Mon, 3 Feb 2025 06:42:18 +0100
Subject: [PATCH 13/65] Properly check if all fused layers are in the list of
 targets (#12666)

Thanks @kylesayrs for catching this!
---
 .../layers/quantization/compressed_tensors/utils.py             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index d700a0b15..4ea79531e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -236,7 +236,7 @@ def _match_fused_layer(layer_name: str,
                                for type_suffix in possible_layer_types)
 
         if is_same_parent and is_matching_type and all(
-                '.'.join([parent_path, type_suffix])
+            (f"{parent_path}.{type_suffix}" in target_layers)
                 for type_suffix in possible_layer_types):
             return target
 
-- 
GitLab


From b9986454fe8ba80e2a109d069397b6b59aae658b Mon Sep 17 00:00:00 2001
From: Srikanth Srinivas <srikanth@astrum.ai>
Date: Sun, 2 Feb 2025 21:46:19 -0800
Subject: [PATCH 14/65] Fix for attention layers to remain unquantized during
 moe_wn16 quant (#12570)

Fix to AWQ quant loading of the new R1 model

The new optimized MoE kernels for a large number of experts `moe_wn16`
uses AWQ quant which requires the attention layers to be in 16bit

The current merge has broken this, and the `get_quant_method` must
return None for it to work correctly again

---------

Signed-off-by: Srikanth Srinivas <srikanth@astrum.ai>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Beim <beim2015@outlook.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Ryan N <ryan.nguyen@centml.ai>
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Shawn Du <shawnd200@outlook.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Beim <805908499@qq.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Nishidha <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Ryan Nguyen <96593302+xpbowler@users.noreply.github.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
Co-authored-by: fade_away <1028552010@qq.com>
Co-authored-by: weilong.yu <weilong.yu@shopee.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Eldar Kurtic <eldarkurtic314@gmail.com>
Co-authored-by: Rahul Tuli <rahul@neuralmagic.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Co-authored-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Shawn Du <shawnd200@outlook.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/layers/quantization/moe_wna16.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 1ae765a22..56fa597e2 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -7,7 +7,8 @@ import torch
 from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.linear import (LinearBase,
+                                               UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.base_config import (
@@ -125,9 +126,7 @@ class MoeWNA16Config(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
             return UnquantizedLinearMethod()
-        elif isinstance(layer, FusedMoE):
-            return MoeWNA16Method(self)
-        else:
+        elif isinstance(layer, LinearBase):
             if self.linear_quant_method == "gptq":
                 if self.use_marlin:
                     return GPTQMarlinConfig.from_config(
@@ -144,6 +143,9 @@ class MoeWNA16Config(QuantizationConfig):
                         self.full_config).get_quant_method(layer, prefix)
             else:
                 raise ValueError("moe_wna16 only support gptq and awq.")
+        elif isinstance(layer, FusedMoE):
+            return MoeWNA16Method(self)
+        return None
 
 
 def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
-- 
GitLab


From ad4a9dc817f00c266d1ca210342d1865aa69db27 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 3 Feb 2025 15:58:21 +0800
Subject: [PATCH 15/65] [cuda] manually import the correct pynvml module
 (#12679)

fixes problems like https://github.com/vllm-project/vllm/pull/12635 and
https://github.com/vllm-project/vllm/pull/12636 and
https://github.com/vllm-project/vllm/pull/12565

---------

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/platforms/__init__.py |  3 ++-
 vllm/platforms/cuda.py     | 10 ++------
 vllm/utils.py              | 52 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index d34b660df..9c98942b5 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -33,7 +33,8 @@ def cuda_platform_plugin() -> Optional[str]:
     is_cuda = False
 
     try:
-        import pynvml
+        from vllm.utils import import_pynvml
+        pynvml = import_pynvml()
         pynvml.nvmlInit()
         try:
             if pynvml.nvmlDeviceGetCount() > 0:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 44d2506f0..b49852a72 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -8,7 +8,6 @@ from functools import lru_cache, wraps
 from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar,
                     Union)
 
-import pynvml
 import torch
 from typing_extensions import ParamSpec
 
@@ -16,6 +15,7 @@ from typing_extensions import ParamSpec
 import vllm._C  # noqa
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.utils import import_pynvml
 
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
 
@@ -29,13 +29,7 @@ logger = init_logger(__name__)
 _P = ParamSpec("_P")
 _R = TypeVar("_R")
 
-if pynvml.__file__.endswith("__init__.py"):
-    logger.warning(
-        "You are using a deprecated `pynvml` package. Please install"
-        " `nvidia-ml-py` instead, and make sure to uninstall `pynvml`."
-        " When both of them are installed, `pynvml` will take precedence"
-        " and cause errors. See https://pypi.org/project/pynvml "
-        "for more information.")
+pynvml = import_pynvml()
 
 # pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
 # see https://github.com/huggingface/diffusers/issues/9704 for details
diff --git a/vllm/utils.py b/vllm/utils.py
index 3089f0951..a2b53fcf2 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2208,3 +2208,55 @@ def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any],
     else:
         func = partial(method, obj)  # type: ignore
     return func(*args, **kwargs)
+
+
+def import_pynvml():
+    """
+    Historical comments:
+
+    libnvml.so is the library behind nvidia-smi, and
+    pynvml is a Python wrapper around it. We use it to get GPU
+    status without initializing CUDA context in the current process.
+    Historically, there are two packages that provide pynvml:
+    - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): The official
+        wrapper. It is a dependency of vLLM, and is installed when users
+        install vLLM. It provides a Python module named `pynvml`.
+    - `pynvml` (https://pypi.org/project/pynvml/): An unofficial wrapper.
+        Prior to version 12.0, it also provides a Python module `pynvml`,
+        and therefore conflicts with the official one. What's worse,
+        the module is a Python package, and has higher priority than
+        the official one which is a standalone Python file.
+        This causes errors when both of them are installed.
+        Starting from version 12.0, it migrates to a new module
+        named `pynvml_utils` to avoid the conflict.
+    
+    TL;DR: if users have pynvml<12.0 installed, it will cause problems.
+    Otherwise, `import pynvml` will import the correct module.
+    We take the safest approach here, to manually import the correct
+    `pynvml.py` module from the `nvidia-ml-py` package.
+    """
+    if TYPE_CHECKING:
+        import pynvml
+        return pynvml
+    if "pynvml" in sys.modules:
+        import pynvml
+        if pynvml.__file__.endswith("__init__.py"):
+            # this is pynvml < 12.0
+            raise RuntimeError(
+                "You are using a deprecated `pynvml` package. "
+                "Please uninstall `pynvml` or upgrade to at least"
+                " version 12.0. See https://pypi.org/project/pynvml "
+                "for more information.")
+        return sys.modules["pynvml"]
+    import importlib.util
+    import os
+    import site
+    for site_dir in site.getsitepackages():
+        pynvml_path = os.path.join(site_dir, "pynvml.py")
+        if os.path.exists(pynvml_path):
+            spec = importlib.util.spec_from_file_location(
+                "pynvml", pynvml_path)
+            pynvml = importlib.util.module_from_spec(spec)
+            sys.modules["pynvml"] = pynvml
+            spec.loader.exec_module(pynvml)
+            return pynvml
-- 
GitLab


From 1298a400e8e8496b6e9ce3847ada5ec9f6e6cb48 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 3 Feb 2025 15:59:49 +0800
Subject: [PATCH 16/65] [ci/build] fix gh200 test (#12681)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/check-wheel-size.py | 4 ++--
 .buildkite/run-gh200-test.sh   | 4 ++--
 Dockerfile                     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 2e4aecdd3..a378bc6ba 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -4,11 +4,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 
 
 def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 3e4e40946..99972afa2 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and test offline inference
-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic.py
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
 '
diff --git a/Dockerfile b/Dockerfile
index 0b9f74e08..7ecb643f4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=300
+ARG VLLM_MAX_SIZE_MB=400
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
-- 
GitLab


From a1a2aaadb9122f05667140e39cf67e5736c8b6d6 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Mon, 3 Feb 2025 14:30:38 +0100
Subject: [PATCH 17/65] [Model]: Add `transformers` backend support (#11330)

# Adds support for `transformers` as a backend

Following https://github.com/huggingface/transformers/pull/35235, a
bunch of models should already be supported, we are ramping up support
for more models.

Thanks @Isotr0py for the TP support, and @hmellor for his help as well!
This includes:
- `trust_remote_code=True` support: any model on the hub, if it
implements attention the correct way can be natively supported!!
- tensor parallel support

---------

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .buildkite/test-pipeline.yaml              |   2 +
 docs/source/models/supported_models.md     |  76 ++++++
 requirements-common.txt                    |   2 +-
 tests/models/registry.py                   |   5 +
 tests/models/test_oot_registration.py      |   4 +-
 tests/models/test_transformers.py          |  75 ++++++
 vllm/config.py                             |  14 ++
 vllm/engine/arg_utils.py                   |  22 +-
 vllm/model_executor/model_loader/utils.py  |  61 ++++-
 vllm/model_executor/models/registry.py     |  12 +-
 vllm/model_executor/models/transformers.py | 264 +++++++++++++++++++++
 11 files changed, 528 insertions(+), 9 deletions(-)
 create mode 100644 tests/models/test_transformers.py
 create mode 100644 vllm/model_executor/models/transformers.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 05f3c3b31..a847a68a6 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -349,6 +349,7 @@ steps:
   - vllm/
   - tests/models
   commands:
+    - pytest -v -s models/test_transformers.py
     - pytest -v -s models/test_registry.py
     - pytest -v -s models/test_initialization.py
 
@@ -485,6 +486,7 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index afaad8818..4a0996469 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -40,6 +40,82 @@ If vLLM successfully returns text (for generative models) or hidden states (for
 Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
 Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
 
+### Transformers fallback
+
+After the merge of <gh-pr:11330>, `vllm` can fallback to models that are available in `transformers`. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned!
+
+To check if the backend is `transformers`, you can simply do this:
+
+```python 
+from vllm import LLM
+llm = LLM(model=..., task="generate")  # Name or path of your model
+llm.apply_model(lambda model: print(model.__class__))
+```
+
+If it is `TransformersModel` then it means it's based on `transformers`!
+
+#### Supported features
+
+##### LORA and quantization
+
+Both are not supported yet! Make sure to open an issue and we'll work on this together with the `transformers` team!
+
+Usually `transformers` model load weights via the `load_adapters` API, that depends on PEFT. We need to work a bit to either use this api (for now this would result in some weights not being marked as loaded) or replace modules accordingly.
+
+Hints as to how this would look like:
+
+```python
+class TransformersModel(nn.Module, SupportsLoRA):
+  def __init__(*):
+    ...
+    self.model.load_adapter(vllm_config.load_config.model_loader_extra_config["qlora_adapter_name_or_path"])
+```
+
+Blocker is that you need to specify supported lora layers, when we would ideally want to load whatever is inside the checkpoint!
+
+##### Remote code
+
+This fallback also means that any model on the hub that can be used in `transformers` with `trust_remote_code=True` that correctly implements attention can be used in production!
+
+```python 
+from vllm import LLM
+llm = LLM(model=..., task="generate", trust_remote_code=True)  # Name or path of your model
+llm.apply_model(lambda model: print(model.__class__))
+```
+
+A model just needs the following two things:
+
+```python
+from transformers import PreTrainedModel
+from torch import nn
+
+class MyAttention(nn.Module):
+
+  def forward(self, hidden_states, **kwargs): # <- kwargs are required
+
+    ...
+    attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+    attn_output, attn_weights = attention_interface(
+      self,
+      query_states,
+      key_states,
+      value_states,
+      **kwargs,
+    )
+    ...
+
+class MyModel(PreTrainedModel):
+  _supports_attention_backend = True
+```
+
+Here is what happens in the background:
+
+1. The config is loaded
+2. `MyModel` python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
+3. The `TransformersModel` backend is used. See `/model_executors/models/transformers`, which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`.
+
+That's it!
+
 ### ModelScope
 
 To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
diff --git a/requirements-common.txt b/requirements-common.txt
index e5248572c..97e33a6db 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -5,7 +5,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.48.2  # Required for Bamba.
+transformers >= 4.48.2  # Required for Bamba model and Transformers backend.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
 fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
diff --git a/tests/models/registry.py b/tests/models/registry.py
index d0dbbf00e..8a0ade4fa 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -281,12 +281,17 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                                     speculative_model="ibm-fms/llama-160m-accelerator"),  # noqa: E501
 }
 
+_FALLBACK_MODEL = {
+    "TransformersModel": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True),  # noqa: E501
+}
+
 _EXAMPLE_MODELS = {
     **_TEXT_GENERATION_EXAMPLE_MODELS,
     **_EMBEDDING_EXAMPLE_MODELS,
     **_CROSS_ENCODER_EXAMPLE_MODELS,
     **_MULTIMODAL_EXAMPLE_MODELS,
     **_SPECULATIVE_DECODING_EXAMPLE_MODELS,
+    **_FALLBACK_MODEL,
 }
 
 
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index ef665baa1..f2a505596 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -15,7 +15,9 @@ def test_plugin(dummy_opt_path):
     os.environ["VLLM_PLUGINS"] = ""
     with pytest.raises(Exception) as excinfo:
         LLM(model=dummy_opt_path, load_format="dummy")
-    assert "are not supported for now" in str(excinfo.value)
+    error_msg = "has no vLLM implementation and " \
+                "the Transformers implementation is not compatible with vLLM."
+    assert (error_msg in str(excinfo.value))
 
 
 @fork_new_process_for_each_test
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
new file mode 100644
index 000000000..c6536f37c
--- /dev/null
+++ b/tests/models/test_transformers.py
@@ -0,0 +1,75 @@
+"""Test the functionality of the Transformers backend.
+
+Run `pytest tests/models/test_transformers.py`.
+"""
+from contextlib import nullcontext
+from typing import Type
+
+import pytest
+
+from ..conftest import HfRunner, VllmRunner
+from ..utils import multi_gpu_test
+from .utils import check_logprobs_close
+
+
+def check_implementation(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    **kwargs,
+):
+    max_tokens = 32
+    num_logprobs = 5
+
+    with vllm_runner(model, **kwargs) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize(
+    "model,model_impl",
+    [
+        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
+        ("openai-community/gpt2", "transformers"),
+        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
+        ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
+    ])  # trust_remote_code=True by default
+def test_models(hf_runner, vllm_runner, example_prompts, model,
+                model_impl) -> None:
+
+    maybe_raises = nullcontext()
+    if model == "openai-community/gpt2" and model_impl == "transformers":
+        # Model is not backend compatible
+        maybe_raises = pytest.raises(
+            ValueError,
+            match="The Transformers implementation.*not compatible with vLLM")
+
+    with maybe_raises:
+        check_implementation(hf_runner,
+                             vllm_runner,
+                             example_prompts,
+                             model,
+                             model_impl=model_impl)
+
+
+@multi_gpu_test(num_gpus=2)
+def test_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+):
+    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
+    check_implementation(hf_runner, vllm_runner, example_prompts,
+                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
diff --git a/vllm/config.py b/vllm/config.py
index d2d59c705..d70a63795 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -83,6 +83,12 @@ class SupportsHash(Protocol):
         ...
 
 
+class ModelImpl(str, enum.Enum):
+    AUTO = "auto"
+    VLLM = "vllm"
+    TRANSFORMERS = "transformers"
+
+
 class ModelConfig:
     """Configuration for the model.
 
@@ -167,6 +173,12 @@ class ModelConfig:
             `logits_processors` extra completion argument. Defaults to None,
             which allows no processors.
         generation_config: Configuration parameter file for generation.
+        model_impl: Which implementation of the model to use:
+            "auto" will try to use the vLLM implementation if it exists and
+                fall back to the Transformers implementation if no vLLM
+                implementation is available.
+            "vllm" will use the vLLM model implementation.
+            "transformers" will use the Transformers model implementation.
         override_generation_config: Override the generation config with the
             given config.
     """
@@ -230,6 +242,7 @@ class ModelConfig:
         generation_config: Optional[str] = None,
         enable_sleep_mode: bool = False,
         override_generation_config: Optional[Dict[str, Any]] = None,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -241,6 +254,7 @@ class ModelConfig:
         self.code_revision = code_revision
         self.rope_scaling = rope_scaling
         self.rope_theta = rope_theta
+        self.model_impl = model_impl
 
         if hf_overrides is None:
             hf_overrides = {}
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7c0e8c214..40c6fb456 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -13,10 +13,10 @@ import vllm.envs as envs
 from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
                          DecodingConfig, DeviceConfig, HfOverrides,
                          KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PoolerConfig, PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig, TaskOption, TokenizerPoolConfig,
-                         VllmConfig)
+                         ModelConfig, ModelImpl, ObservabilityConfig,
+                         ParallelConfig, PoolerConfig, PromptAdapterConfig,
+                         SchedulerConfig, SpeculativeConfig, TaskOption,
+                         TokenizerPoolConfig, VllmConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -199,6 +199,7 @@ class EngineArgs:
     generation_config: Optional[str] = None
     override_generation_config: Optional[Dict[str, Any]] = None
     enable_sleep_mode: bool = False
+    model_impl: str = "auto"
 
     calculate_kv_scales: Optional[bool] = None
 
@@ -378,6 +379,18 @@ class EngineArgs:
             'qualified names that can be passed with the `logits_processors` '
             'extra completion argument. Defaults to None, which allows no '
             'processors.')
+        parser.add_argument(
+            '--model-impl',
+            type=str,
+            default=EngineArgs.model_impl,
+            choices=[f.value for f in ModelImpl],
+            help='Which implementation of the model to use.\n\n'
+            '* "auto" will try to use the vLLM implementation if it exists '
+            'and fall back to the Transformers implementation if no vLLM '
+            'implementation is available.\n'
+            '* "vllm" will use the vLLM model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            'implementation.\n')
         # Parallel arguments
         parser.add_argument(
             '--distributed-executor-backend',
@@ -1017,6 +1030,7 @@ class EngineArgs:
             generation_config=self.generation_config,
             override_generation_config=self.override_generation_config,
             enable_sleep_mode=self.enable_sleep_mode,
+            model_impl=self.model_impl,
         )
 
     def create_load_config(self) -> LoadConfig:
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 084ca53b1..eb334c1fd 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -2,17 +2,22 @@
 """Utilities for selecting and loading models."""
 import contextlib
 from dataclasses import dataclass, field
-from typing import Dict, List, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type
 
 import torch
+import transformers
 from torch import nn
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, ModelImpl
+from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.adapters import (as_classification_model,
                                                  as_embedding_model,
                                                  as_reward_model)
 
+logger = init_logger(__name__)
+
 
 @contextlib.contextmanager
 def set_default_torch_dtype(dtype: torch.dtype):
@@ -23,6 +28,50 @@ def set_default_torch_dtype(dtype: torch.dtype):
     torch.set_default_dtype(old_dtype)
 
 
+def is_transformers_impl_compatible(
+        arch: str,
+        module: Optional[transformers.PreTrainedModel] = None) -> bool:
+    mod = module or getattr(transformers, arch, None)
+    if mod is None:
+        return False
+    if hasattr(mod, "supports_backend"):
+        return mod.is_backend_compatible()
+    else:
+        return mod._supports_flex_attn
+
+
+def resolve_transformers_fallback(model_config: ModelConfig,
+                                  architectures: list[str]):
+    for i, arch in enumerate(architectures):
+        if arch == "TransformersModel":
+            continue
+        custom_module = None
+        auto_map = getattr(model_config.hf_config, "auto_map", None)
+        if auto_map is not None and "AutoModel" in auto_map:
+            custom_module = get_class_from_dynamic_module(
+                model_config.hf_config.auto_map["AutoModel"],
+                model_config.model)
+        # TODO(Isotr0py): Further clean up these raises.
+        # perhaps handled them in _ModelRegistry._raise_for_unsupported?
+        if model_config.model_impl == ModelImpl.TRANSFORMERS:
+            if not is_transformers_impl_compatible(arch, custom_module):
+                raise ValueError(
+                    f"The Transformers implementation of {arch} is not "
+                    "compatible with vLLM.")
+            architectures[i] = "TransformersModel"
+        if model_config.model_impl == ModelImpl.AUTO:
+            if not is_transformers_impl_compatible(arch, custom_module):
+                raise ValueError(
+                    f"{arch} has no vLLM implementation and the Transformers "
+                    "implementation is not compatible with vLLM.")
+            logger.warning(
+                "%s has no vLLM implementation, falling back to Transformers "
+                "implementation. Some features may not be supported and "
+                "performance may not be optimal.", arch)
+            architectures[i] = "TransformersModel"
+    return architectures
+
+
 def get_model_architecture(
         model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
     architectures = getattr(model_config.hf_config, "architectures", [])
@@ -38,6 +87,14 @@ def get_model_architecture(
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]
 
+    vllm_supported_archs = ModelRegistry.get_supported_archs()
+    is_vllm_supported = any(arch in vllm_supported_archs
+                            for arch in architectures)
+    if (not is_vllm_supported
+            or model_config.model_impl == ModelImpl.TRANSFORMERS):
+        architectures = resolve_transformers_fallback(model_config,
+                                                      architectures)
+
     model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
     if model_config.task == "embed":
         model_cls = as_embedding_model(model_cls)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 40bbc7d16..962f95f10 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -184,6 +184,10 @@ _SPECULATIVE_DECODING_MODELS = {
     "MedusaModel": ("medusa", "Medusa"),
     "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
 }
+
+_FALLBACK_MODEL = {
+    "TransformersModel": ("transformers", "TransformersModel"),
+}
 # yapf: enable
 
 _VLLM_MODELS = {
@@ -192,6 +196,7 @@ _VLLM_MODELS = {
     **_CROSS_ENCODER_MODELS,
     **_MULTIMODAL_MODELS,
     **_SPECULATIVE_DECODING_MODELS,
+    **_FALLBACK_MODEL,
 }
 
 
@@ -378,7 +383,12 @@ class _ModelRegistry:
         if not architectures:
             logger.warning("No model architectures are specified")
 
-        return architectures
+        normalized_arch = []
+        for model in architectures:
+            if model not in self.models:
+                model = "TransformersModel"
+            normalized_arch.append(model)
+        return normalized_arch
 
     def inspect_model_cls(
         self,
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
new file mode 100644
index 000000000..ff1ae0ac8
--- /dev/null
+++ b/vllm/model_executor/models/transformers.py
@@ -0,0 +1,264 @@
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wrapper around `transformers` models"""
+import re
+from typing import Iterable, List, Optional, Set, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import AutoModel, PreTrainedModel
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed.utils import divide
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .utils import maybe_prefix
+
+logger = init_logger(__name__)
+
+
+def vllm_flash_attention_forward(
+        # Transformers args
+        module: torch.nn.Module,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: torch.Tensor,
+        # Transformers kwargs
+        scaling: float = None,
+        # vLLM kwargs
+        attn_metadata: AttentionMetadata = None,
+        attention_instances: list[Attention] = None,
+        **kwargs):
+    self_attn = attention_instances[module.layer_idx]
+    if scaling is not None:
+        self_attn.impl.scale = float(scaling)
+    hidden = query.shape[-2]
+    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
+    return self_attn.forward(
+        query,
+        key,
+        value,
+        kv_cache=None,  # argument not used
+        attn_metadata=attn_metadata), None
+
+
+ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward
+
+
+# Linear Layer that is compatible with transformers internal forward
+# TODO: This is a temporary solution, we should find a better way to integrate
+class HFColumnParallelLinear(ColumnParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+class HFRowParallelLinear(RowParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+def replace_tp_linear_class(orig_module: nn.Linear,
+                            style: str,
+                            quant_config=None):
+    """
+    In model configurations, we use a neutral type (string) to specify parallel
+    styles, here we use it to translate nn.Linear into vllm-style tp Linear.
+
+    Quant config is not supported yet
+    """
+
+    if not isinstance(style, str):
+        raise ValueError(
+            f"Unsupported parallel style type {type(style)}, expected str")
+
+    input_size = orig_module.in_features
+    output_size = orig_module.out_features
+    bias = orig_module.bias is not None
+
+    if style == "colwise":
+        return HFColumnParallelLinear(
+            input_size,
+            output_size,
+            bias,
+        )
+    elif style == "rowwise":
+        return HFRowParallelLinear(
+            input_size,
+            output_size,
+            bias,
+        )
+    # We don't consider colwise_rep since it's used in lm_head
+    else:
+        raise ValueError(f"Unsupported parallel style value: {style}")
+
+
+class TransformersModel(nn.Module):
+    embedding_padding_modules = ["lm_head"]
+    embedding_modules = ["embed_tokens"
+                         ]  # TODO transformers will have a util to get it
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        logger.info("Using Transformers backend.")
+
+        self.vllm_config = vllm_config
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        self.quant_config = quant_config
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
+
+        self.model: PreTrainedModel = AutoModel.from_config(
+            self.config,
+            attn_implementation="vllm",
+            torch_dtype=vllm_config.model_config.dtype,
+            trust_remote_code=vllm_config.model_config.trust_remote_code,
+        )
+        prefix = self.model.base_model_prefix
+
+        # MLP modifications
+        self.tensor_parallelize(self.model)
+
+        # Attention modifications (assumes 1 attention op per hidden layer)
+        tp_size = get_tensor_model_parallel_world_size()
+        self.attention_instances = [
+            Attention(
+                num_heads=divide(config.num_attention_heads, tp_size),
+                head_size=config.head_dim,
+                # NOTE: We use Llama scale as default, if it's set by
+                # Transformers, it's updated in vllm_flash_attention_forward
+                scale=config.head_dim**-0.5,
+                num_kv_heads=divide(config.num_key_value_heads, tp_size),
+                cache_config=cache_config,
+                quant_config=None,
+                prefix=f"{i}.attn") for i in range(config.num_hidden_layers)
+        ]
+
+        # Model modifications
+        self.replace_vocab_embed_class(self.model)
+
+        # ForCausalLM modifications
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=None,
+                                      prefix=maybe_prefix(prefix, "lm_head"))
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.get_input_embeddings().weight
+
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size, logit_scale)
+        self.sampler = get_sampler()
+
+    def log_replacement(self, name: str, old_module: nn.Module,
+                        new_module: nn.Module):
+        logger.debug("%s: %s -> %s", name, old_module, new_module)
+
+    def tensor_parallelize(self, module: nn.Module, prefix: str = ""):
+        if (self.config.base_model_tp_plan is None
+                and self.vllm_config.parallel_config.tensor_parallel_size > 1):
+            raise ValueError(
+                "Trying to run tensor parallelization but the model does not "
+                "support it yet!")
+
+        for child_name, child_module in module.named_children():
+            qual_name = prefix + child_name
+            for pattern, style in self.config.base_model_tp_plan.items():
+                if re.match(pattern, qual_name) and isinstance(
+                        child_module, nn.Linear):
+                    new_module = replace_tp_linear_class(
+                        child_module, style, self.quant_config)
+                    setattr(module, child_name, new_module)
+                    self.log_replacement(qual_name, child_module, new_module)
+            else:
+                self.tensor_parallelize(child_module, prefix=f"{qual_name}.")
+
+    def replace_vocab_embed_class(self, module: nn.Module):
+        # Use native set input embeddings
+        new_module = VocabParallelEmbedding(
+            self.vocab_size,
+            self.config.hidden_size,
+            org_num_embeddings=self.config.vocab_size,
+            quant_config=None,
+        )
+        self.log_replacement("input embedding",
+                             self.model.get_input_embeddings(), new_module)
+        self.model.set_input_embeddings(new_module)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],  # argument not used
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        model_output = self.model(
+            input_ids[None, ...],
+            use_cache=False,
+            position_ids=positions[None, ...],
+            attn_metadata=attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            attention_instances=self.attention_instances,
+            return_dict=False)[0][0, ...]  # we remove batch dimension for now
+        return model_output
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(self, logits: torch.Tensor,
+               sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
+
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                name = f"{self.model.base_model_prefix}.{name}"
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+        return loaded_params
-- 
GitLab


From 33e0602e59cfb37ab0bfdff5ea6802aeb3a3ecc9 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Mon, 3 Feb 2025 14:16:59 -0500
Subject: [PATCH 18/65] [Misc] Fix improper placement of SPDX header in scripts
 (#12694)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 cmake/hipify.py                              |  3 +--
 tests/models/test_transformers.py            |  1 +
 tools/check_spdx_header.py                   | 17 ++++++++++++-----
 tools/report_build_time_ninja.py             |  2 +-
 vllm/attention/ops/triton_flash_attention.py |  3 +--
 vllm/model_executor/models/transformers.py   |  1 +
 6 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/cmake/hipify.py b/cmake/hipify.py
index 2e0c8a172..a15577125 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -1,6 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
-
 #!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
 
 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index c6536f37c..1d5d9729d 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """Test the functionality of the Transformers backend.
 
 Run `pytest tests/models/test_transformers.py`.
diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py
index 3f7fd66bf..709befc53 100644
--- a/tools/check_spdx_header.py
+++ b/tools/check_spdx_header.py
@@ -10,18 +10,25 @@ def check_spdx_header(file_path):
     with open(file_path, encoding='UTF-8') as file:
         lines = file.readlines()
         if not lines:
-            # not necessary for an empty file like __init__.py
+            # Empty file like __init__.py
             return True
-        if not lines[0].strip().startswith(SPDX_HEADER_PREFIX):
-            return False
-    return True
+        for line in lines:
+            if line.strip().startswith(SPDX_HEADER_PREFIX):
+                return True
+    return False
 
 
 def add_header(file_path):
     with open(file_path, 'r+', encoding='UTF-8') as file:
         lines = file.readlines()
         file.seek(0, 0)
-        file.write(SPDX_HEADER + '\n\n' + ''.join(lines))
+        if lines and lines[0].startswith("#!"):
+            file.write(lines[0])
+            file.write(SPDX_HEADER + '\n')
+            file.writelines(lines[1:])
+        else:
+            file.write(SPDX_HEADER + '\n')
+            file.writelines(lines)
 
 
 def main():
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index 33e85b9ff..011af2522 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -1,6 +1,6 @@
+#!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 
-#!/usr/bin/env python3
 # Copyright (c) 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index ab8fb8953..745818eb6 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -1,6 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
-
 #!/usr/bin/env python
+# SPDX-License-Identifier: Apache-2.0
 """
 Fused Attention
 ===============
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index ff1ae0ac8..160beaa14 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
-- 
GitLab


From c11de33dad217bca79225128059b6fac7e1b2519 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 3 Feb 2025 16:04:59 -0500
Subject: [PATCH 19/65] [Bugfix][Kernel] Fix per-token/per-channel quantization
 for Hopper scaled mm (#12696)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 .../cutlass_w8a8/scaled_mm_c3x.cu             | 59 ++++++++-----------
 1 file changed, 24 insertions(+), 35 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index 72d549e59..e40f28229 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -16,29 +16,11 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
-  using GroupShape = std::array<int64_t, 2>;
-
   int M = a.size(0), N = b.size(1), K = a.size(1);
 
-  GroupShape a_scale_group_shape = [&, &s = a_scales]() -> GroupShape {
-    if (s.numel() == 1) return {M, K};  // tensor-wise
-    if (s.dim() == 2)
-      return {ceil_div(a.size(0), s.size(0)), ceil_div(a.size(1), s.size(1))};
-    TORCH_CHECK(false, "Unsupported scale shape for scale_a");
-  }();
-
-  GroupShape b_scale_group_shape = [&, &s = b_scales]() -> GroupShape {
-    if (s.numel() == 1) return {K, N};  // tensor-wise
-    if (s.dim() == 2)
-      return {ceil_div(b.size(0), s.size(0)), ceil_div(b.size(1), s.size(1))};
-    TORCH_CHECK(false, "Unsupported scale shape for scale_b");
-  }();
-
-  if ((a_scale_group_shape == GroupShape{M, K} ||
-       a_scale_group_shape == GroupShape{1, K}) &&
-      (b_scale_group_shape == GroupShape{K, N} ||
-       b_scale_group_shape == GroupShape{K, 1})) {
-    // "standard per-tensor/per-token/per-channel" scaling
+  if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
+      (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) {
+    // Standard per-tensor/per-token/per-channel scaling
     TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
     if (a.dtype() == torch::kFloat8_e4m3fn) {
       vllm::cutlass_scaled_mm_sm90_fp8(c, a, b, a_scales, b_scales, bias);
@@ -46,25 +28,32 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
       TORCH_CHECK(a.dtype() == torch::kInt8);
       vllm::cutlass_scaled_mm_sm90_int8(c, a, b, a_scales, b_scales, bias);
     }
-  } else if (a_scale_group_shape == GroupShape{1, 128} &&
-             b_scale_group_shape == GroupShape{128, 128}) {
+  } else {
+    using GroupShape = std::array<int64_t, 2>;
+    auto make_group_shape = [](torch::Tensor const& x,
+                               torch::Tensor const& s) -> GroupShape {
+      TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
+      return {ceil_div(x.size(0), s.size(0)), ceil_div(x.size(1), s.size(1))};
+    };
+
+    GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
+    GroupShape b_scale_group_shape = make_group_shape(b, b_scales);
+
     // 1x128 per-token group scales for activations
     // 128x128 blockwise scales for weights
-    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn &&
-                    b.dtype() == torch::kFloat8_e4m3fn,
-                "Currently only FP8 is supported for A group shape 1x128 and "
-                "B group shape 128x128");
-    TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
-
-    vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
-  } else {
-    TORCH_CHECK(false,
-                "Unsupported scale group shapes for CUTLASS 3.x GEMM.\n "
-                "a_scale_group_shape must be [1, 128], got: [",
+    TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} &&
+                 b_scale_group_shape == GroupShape{128, 128} &&
+                 a.dtype() == torch::kFloat8_e4m3fn &&
+                 b.dtype() == torch::kFloat8_e4m3fn),
+                "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n"
+                "a_scale_group_shape must be [1, 128]. Got: [",
                 a_scale_group_shape[0], ", ", a_scale_group_shape[1],
                 "]\n"
-                "b_scale_group_shape must be [128, 128], got: [",
+                "b_scale_group_shape must be [128, 128]. Got: [",
                 b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
+    TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
+
+    vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
   }
 }
 
-- 
GitLab


From 6dd5e52823cc0ca8ddc9c4377d29ead37cc09a95 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 3 Feb 2025 16:29:56 -0500
Subject: [PATCH 20/65] Squelch MLA warning for Compressed-Tensors Models
 (#12704)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 vllm/config.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index d70a63795..2f4a7ad76 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -986,6 +986,9 @@ class ModelConfig:
 
     @property
     def use_mla(self) -> bool:
+        if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
+            return False
+
         if self.quantization is not None and self.quantization not in [\
             "fp8", "compressed-tensors"]:
             logger.warning(
@@ -1012,8 +1015,7 @@ class ModelConfig:
                         quant_config)
                     return False
 
-        use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
-        return use_mla
+        return True
 
     @property
     def supported_runner_types(self) -> Set[RunnerType]:
-- 
GitLab


From 4797dad3ec48c2b1fa8a1e4cc53c7854675b6b8d Mon Sep 17 00:00:00 2001
From: kushanam <42385577+kushanam@users.noreply.github.com>
Date: Mon, 3 Feb 2025 13:30:39 -0800
Subject: [PATCH 21/65] [Model] Add Deepseek V3 fp8_w8a8 configs for B200
 (#12707)

---
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 13 files changed, 1898 insertions(+)
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json

diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..77ba0d747
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..0a5d7bfdb
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..cb91a279d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..7febe3d27
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9d7658bfc
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..03dba5ad1
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9a5ff48b8
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..386928de1
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9c908e804
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..f78e7060e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1d3ce5c94
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3ab5796ee
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3cb7eaa07
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
-- 
GitLab


From cf58b9c4cab7a90e56a45d30edfc43912b9a0a56 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Mon, 3 Feb 2025 13:34:16 -0800
Subject: [PATCH 22/65] [MISC] Remove model input dumping when exception
 (#12582)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
---
 .github/ISSUE_TEMPLATE/400-bug-report.yml     |  9 ---
 .../test_basic_correctness.py                 | 58 ------------------
 vllm/worker/model_runner.py                   |  3 +-
 vllm/worker/model_runner_base.py              | 61 +------------------
 4 files changed, 3 insertions(+), 128 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml
index 30db1721a..d4113da8b 100644
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -30,15 +30,6 @@ body:
       </details>
   validations:
     required: true
-- type: textarea
-  attributes:
-    label: Model Input Dumps
-    description: |
-      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
-    placeholder: |
-      Upload the dumped input file.
-  validations:
-    required: false
 - type: textarea
   attributes:
     label: 🐛 Describe the bug
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 2792dfde7..f001a8935 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -4,16 +4,12 @@
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
 import os
-import pickle
-import re
 import weakref
-from unittest.mock import patch
 
 import pytest
 
 from vllm import LLM
 from vllm.platforms import current_platform
-from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
 
 from ..conftest import VllmRunner
 from ..models.utils import check_outputs_equal
@@ -151,57 +147,3 @@ def test_models_distributed(
         name_0="hf",
         name_1="vllm",
     )
-
-
-@pytest.mark.skip_v1
-def test_model_with_failure(vllm_runner) -> None:
-    try:
-        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
-                   side_effect=ValueError()):
-            with pytest.raises(ValueError) as exc_info:
-                vllm_runner("facebook/opt-125m",
-                            dtype="half",
-                            enforce_eager=False,
-                            gpu_memory_utilization=0.7)
-            matches = re.search(r"input dumped to (.+).pkl",
-                                str(exc_info.value))
-            assert matches is not None
-            filename = f"{matches.group(1)}.pkl"
-
-        with open(filename, "rb") as filep:
-            inputs = pickle.load(filep)
-
-        if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
-            raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
-                                 f"{list(inputs.keys())}")
-        assert isinstance(inputs["arg_1"],
-                          ModelInputForGPUWithSamplingMetadata)
-    finally:
-        os.remove(filename)
-
-
-@pytest.mark.skip_v1
-def test_failure_with_async_out_proc(vllm_runner) -> None:
-
-    filename = None
-    try:
-        with vllm_runner("facebook/opt-125m",
-                         dtype="half",
-                         enforce_eager=False,
-                         gpu_memory_utilization=0.7) as vllm_model,\
-             patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
-                       side_effect=ValueError()):
-            model_config = vllm_model.model.llm_engine.model_config
-            assert model_config.use_async_output_proc
-            with pytest.raises(ValueError) as exc_info:
-                vllm_model.generate_greedy('how to make pizza?', 250)
-            matches = re.search(r"input dumped to (.+).pkl",
-                                str(exc_info.value))
-            assert matches is not None
-
-            filename = f"{matches.group(1)}.pkl"
-    finally:
-        # Clean up
-        if filename is not None:
-            os.remove(filename)
-        pass
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 90f08b1df..0bbba55b3 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -57,7 +57,7 @@ from vllm.worker.model_runner_base import (
     _add_attn_metadata_broadcastable_dict,
     _add_sampling_metadata_broadcastable_dict,
     _init_attn_metadata_from_tensor_dict,
-    _init_sampling_metadata_from_tensor_dict, dump_input_when_exception)
+    _init_sampling_metadata_from_tensor_dict)
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -1647,7 +1647,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
                                    virtual_engine=virtual_engine)
 
     @torch.inference_mode()
-    @dump_input_when_exception(exclude_args=[0], exclude_kwargs=["self"])
     def execute_model(
         self,
         model_input: ModelInputForGPUWithSamplingMetadata,
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 9e33ef9f1..38d2b712e 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -1,16 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dataclasses
-import pickle
 from abc import ABC, abstractmethod
-from datetime import datetime
-from functools import wraps
-from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List,
-                    Optional, Type, TypeVar)
+from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type,
+                    TypeVar)
 
 import torch
 import torch.nn as nn
-from torch import is_tensor
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
@@ -107,59 +103,6 @@ def _init_frozen_model_input_from_tensor_dict(
     return tensor_dict
 
 
-def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
-                              exclude_kwargs: Optional[List[str]] = None):
-
-    def _inner(func):
-
-        @wraps(func)
-        def _wrapper(*args, **kwargs):
-            try:
-                return func(*args, **kwargs)
-            except Exception as err:
-                timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-                filename = f"/tmp/err_{func.__name__}_input_{timestamp}.pkl"
-                logger.info("Writing input of failed execution to %s...",
-                            filename)
-                with open(filename, "wb") as filep:
-                    dumped_inputs = {
-                        k: v
-                        for k, v in kwargs.items()
-                        if k not in (exclude_kwargs or [])
-                    }
-                    for i, arg in enumerate(args):
-                        if i not in (exclude_args or []):
-                            dumped_inputs[f"arg_{i}"] = arg
-
-                    # Only persist dtype and shape for kvcache tensors
-                    # (can be way to big otherwise)
-                    if (kv_caches := dumped_inputs.get("kv_caches")) \
-                        and isinstance(kv_caches, Iterable):
-                        dumped_inputs["kv_caches"] = [(t.dtype, t.shape)
-                                                      for t in kv_caches
-                                                      if is_tensor(t)]
-
-                    try:
-                        pickle.dump(dumped_inputs, filep)
-                    except Exception as pickle_err:
-                        logger.warning(
-                            "Failed to pickle inputs of failed execution: %s",
-                            str(pickle_err))
-                        raise type(err)(f"Error in model execution: "
-                                        f"{str(err)}") from err
-
-                    logger.info(
-                        "Completed writing input of failed execution to %s.",
-                        filename)
-                raise type(err)(
-                    f"Error in model execution (input dumped to {filename}): "
-                    f"{str(err)}") from err
-
-        return _wrapper
-
-    return _inner
-
-
 class BroadcastableModelInput(ABC):
 
     @abstractmethod
-- 
GitLab


From 5095e966069b9e65b7c4c63427e06cebacaad0a0 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Mon, 3 Feb 2025 15:04:53 -0800
Subject: [PATCH 23/65] [V1] Revert `uncache_blocks` and support recaching full
 blocks (#12415)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
---
 tests/v1/core/test_prefix_caching.py | 30 --------------------
 vllm/v1/core/kv_cache_manager.py     | 41 +++++++++++-----------------
 2 files changed, 16 insertions(+), 55 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 2e16d7d25..a6c0162d3 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -629,33 +629,3 @@ def test_reset_prefix_cache():
     assert manager.reset_prefix_cache()
     assert not manager.cached_block_hash_to_block
     assert all([blk.block_hash is None for blk in manager.block_pool])
-
-
-def test_uncache_blocks():
-    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
-        max_model_len=8192,
-        sliding_window=None,
-        enable_caching=True,
-        num_preallocate_tokens=0,
-    )
-
-    req0 = make_request("0", list(range(30)))
-    blocks = manager.allocate_slots(req0, 30)
-    assert [b.block_id for b in blocks] == [0, 1]
-    assert len(manager.cached_block_hash_to_block) == 1
-
-    req0.num_computed_tokens = 30
-
-    # Simulate speculative tokens.
-    for _ in range(5):
-        req0.append_output_token_ids(8)
-    manager.allocate_slots(req0, 5)
-    assert len(manager.cached_block_hash_to_block) == 2
-
-    # After sampling, assuming only 1 token is accepted.
-    req0.num_computed_tokens = 31
-    num_uncached_blocks = manager.uncache_blocks(req0)
-    assert num_uncached_blocks == 1
-    assert len(manager.cached_block_hash_to_block) == 1
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 94086e4a1..de349ec12 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -252,29 +252,6 @@ class KVCacheManager:
             if block.ref_cnt == 0:
                 self.free_block_queue.append(block)
 
-    def uncache_blocks(self, request: Request) -> int:
-        """Uncache the blocks that are no longer full based on the
-        num_computed_tokens in the given request. This happens when
-        the blocks were full and cached due to speculative tokens, but the
-        speculative tokens are not accepted.
-
-        Args:
-            request: The request.
-
-        Returns:
-            The number of uncached blocks.
-        """
-        blocks = self.req_to_blocks[request.request_id]
-        num_computed_tokens = request.num_computed_tokens
-        num_full_blocks = num_computed_tokens // self.block_size
-        num_uncached_blocks = 0
-        for block in blocks[num_full_blocks:]:
-            # If the block is not cached, the following blocks are not cached.
-            if not self._maybe_evict_cached_block(block):
-                break
-            num_uncached_blocks += 1
-        return num_uncached_blocks
-
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
         flows to invalid prefix caching after the weights are updated,
@@ -470,8 +447,22 @@ class KVCacheManager:
             assert prev_block.block_hash is not None
             prev_block_hash_value = prev_block.block_hash.hash_value
 
-        for i, blk in enumerate(full_blocks):
-            blk_idx = blk_start_idx + i
+        # Find the first uncached block. This case should only happen when
+        # speculative decoding is used.
+        offset = 0
+        for blk in full_blocks:
+            if blk.block_hash is None:
+                break
+            else:
+                prev_block_hash_value = blk.block_hash.hash_value
+                offset += 1
+        else:
+            # All blocks are cached.
+            return
+
+        for i, blk in enumerate(full_blocks[offset:]):
+            blk_idx = blk_start_idx + offset + i
+            assert blk.block_hash is None
 
             if blk_idx < num_cached_block_hashes:
                 # The block hash may already be computed in
-- 
GitLab


From 73b35cca7f3745d07d439c197768b25d88b6ab7f Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Mon, 3 Feb 2025 19:28:20 -0500
Subject: [PATCH 24/65] [Core] Improve hash collision avoidance in prefix
 caching (#12621)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 tests/core/block/test_prefix_caching_block.py |  4 +-
 vllm/core/block/prefix_caching_block.py       | 42 +++++++++++++++----
 vllm/v1/core/kv_cache_utils.py                |  9 ++++
 3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index 771627a57..bf40b334a 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -65,8 +65,8 @@ class TestPrefixCachingBlock:
 
         previous_block = MagicMock(spec=PrefixCachingBlock)
         prev_block_hash = random.randint(0, 1000)
-        previous_block.content_hash = (prev_block_hash
-                                       if prev_block_has_hash else None)
+        previous_block.content_hash = (prev_block_hash if prev_block_has_hash
+                                       else hash('None'))
 
         num_to_fill = block_size if is_curr_block_full else random.randint(
             0, block_size - 1)
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index fbf19e1b4..1ca9e49da 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -65,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             from 0 to num_blocks - 1.
     """
 
+    # Note that we use 'None' as a string here instead of None because
+    # as of Python 3.12, hash(None) returns a constant predictable value.
+    # This could possibly make it easier to find and exploit hash
+    # collisions. 'None' as a string will be hashed differently per process,
+    # but consistently within the same process. This is the same as the
+    # behavior of None prior to Python 3.12.
+    _none_hash: int = hash('None')
+
+    # Implements Block.Factory.
     def __init__(
         self,
         num_blocks: int,
@@ -122,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator):
 
         self.metric_data = CacheMetricData()
 
-    # Implements Block.Factory.
     def _create_block(
         self,
         prev_block: Optional[Block],
@@ -737,6 +745,14 @@ class PrefixCachingBlock(Block):
             such as adapters that influence the block, apart from the token_ids.
     """
 
+    # Note that we use 'None' as a string here instead of None because
+    # as of Python 3.12, hash(None) returns a constant predictable value.
+    # This could possibly make it easier to find and exploit hash
+    # collisions. 'None' as a string will be hashed differently per process,
+    # but consistently within the same process. This is the same as the
+    # behavior of None prior to Python 3.12.
+    _none_hash: int = hash('None')
+
     def __init__(
         self,
         prev_block: Optional[Block],
@@ -891,13 +907,13 @@ class PrefixCachingBlock(Block):
 
         is_first_block = self._prev_block is None
         prev_block_hash = (
-            None if is_first_block else
+            self._none_hash if is_first_block else
             self._prev_block.content_hash  # type: ignore
         )
 
         # Previous block exists but does not yet have a hash.
         # Return no hash in this case.
-        if prev_block_hash is None and not is_first_block:
+        if prev_block_hash == self._none_hash and not is_first_block:
             return None
 
         self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
@@ -907,8 +923,9 @@ class PrefixCachingBlock(Block):
             extra_hash=self._extra_hash)
         return self._cached_content_hash
 
-    @staticmethod
-    def hash_block_tokens(is_first_block: bool,
+    @classmethod
+    def hash_block_tokens(cls,
+                          is_first_block: bool,
                           prev_block_hash: Optional[int],
                           cur_block_token_ids: List[int],
                           extra_hash: Optional[int] = None) -> int:
@@ -929,7 +946,8 @@ class PrefixCachingBlock(Block):
         Returns:
         - int: The computed hash value for the block.
         """
-        assert (prev_block_hash is None) == is_first_block
+        if is_first_block and prev_block_hash is None:
+            prev_block_hash = cls._none_hash
         return hash((is_first_block, prev_block_hash, *cur_block_token_ids,
                      extra_hash))
 
@@ -949,6 +967,14 @@ class ComputedBlocksTracker:
     cached block hashes in the allocator.
     """
 
+    # Note that we use 'None' as a string here instead of None because
+    # as of Python 3.12, hash(None) returns a constant predictable value.
+    # This could possibly make it easier to find and exploit hash
+    # collisions. 'None' as a string will be hashed differently per process,
+    # but consistently within the same process. This is the same as the
+    # behavior of None prior to Python 3.12.
+    _none_hash: int = hash('None')
+
     def __init__(
         self,
         allocator: DeviceAwareBlockAllocator,
@@ -994,7 +1020,7 @@ class ComputedBlocksTracker:
         # We need to know the hash of the previous block to compute the hash of
         # the current block so that blocks could be uniquely identified across
         # sequences of prefixes.
-        prev_block_hash = (None if cur_num_blocks_recorded == 0 else
+        prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else
                            block_hashes_recorded[-1])
         # Only update the computed block hashes for the new blocks
         for i in range(cur_num_blocks_recorded, num_computed_blocks):
@@ -1009,7 +1035,7 @@ class ComputedBlocksTracker:
             # This has to be kept in sync with the allocator's hash
             # calculation.
             block_hash = PrefixCachingBlock.hash_block_tokens(
-                is_first_block=prev_block_hash is None,
+                is_first_block=prev_block_hash == self._none_hash,
                 prev_block_hash=prev_block_hash,
                 cur_block_token_ids=block_token_ids,
                 extra_hash=extra_hash,
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index c801ab9e4..e0976ba85 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -263,6 +263,15 @@ def hash_block_tokens(
         The hash value of the block and the token ids in the block.
         The entire tuple is used as the hash key of the block.
     """
+    if not parent_block_hash:
+        # Note that we use 'None' as a string here instead of None because
+        # as of Python 3.12, hash(None) returns a constant predictable value.
+        # This could possibly make it easier to find and exploit hash
+        # collisions. 'None' as a string will be hashed differently per process,
+        # but consistently within the same process. This is the same as the
+        # behavior of None prior to Python 3.12.
+        parent_block_hash = hash('None')
+
     curr_block_token_ids_tuple = tuple(curr_block_token_ids)
     return BlockHashType(
         hash((parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
-- 
GitLab


From 5d98d56089426555d303d82bdf29b9ffc825ec20 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 3 Feb 2025 22:55:46 -0500
Subject: [PATCH 25/65] Support Pixtral-Large HF by using llava
 multimodal_projector_bias config (#12710)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 vllm/model_executor/models/llava.py            | 6 ++++--
 vllm/model_executor/models/llava_next.py       | 3 ++-
 vllm/model_executor/models/llava_next_video.py | 9 +++++----
 vllm/model_executor/models/llava_onevision.py  | 4 ++--
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index de3777cad..19effcbfc 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -75,19 +75,20 @@ class LlavaMultiModalProjector(nn.Module):
                  vision_hidden_size: int,
                  text_hidden_size: int,
                  projector_hidden_act: str,
+                 multimodal_projector_bias: bool,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = ""):
         super().__init__()
 
         self.linear_1 = ColumnParallelLinear(vision_hidden_size,
                                              text_hidden_size,
-                                             bias=True,
+                                             bias=multimodal_projector_bias,
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.linear_1")
         self.act = get_act_fn(projector_hidden_act)
         self.linear_2 = RowParallelLinear(text_hidden_size,
                                           text_hidden_size,
-                                          bias=True,
+                                          bias=multimodal_projector_bias,
                                           quant_config=quant_config,
                                           prefix=f"{prefix}.linear_2")
 
@@ -503,6 +504,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
             projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "multi_modal_projector"))
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 185edcb8d..defdeb54a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -231,7 +231,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=vision_hidden_size,
             text_hidden_size=config.text_config.hidden_size,
-            projector_hidden_act=config.projector_hidden_act)
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias)
 
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index a50025135..d70ae2f14 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -253,16 +253,16 @@ class LlavaNextVideoPooler(nn.Module):
 class LlavaNextMultiModalProjector(nn.Module):
 
     def __init__(self, vision_hidden_size: int, text_hidden_size: int,
-                 projector_hidden_act: str):
+                 projector_hidden_act: str, multimodal_projector_bias: bool):
         super().__init__()
 
         self.linear_1 = nn.Linear(vision_hidden_size,
                                   text_hidden_size,
-                                  bias=True)
+                                  bias=multimodal_projector_bias)
         self.act = get_act_fn(projector_hidden_act)
         self.linear_2 = nn.Linear(text_hidden_size,
                                   text_hidden_size,
-                                  bias=True)
+                                  bias=multimodal_projector_bias)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
@@ -298,7 +298,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multi_modal_projector = LlavaNextMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
-            projector_hidden_act=config.projector_hidden_act)
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index ac502000c..f1c06cd85 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -372,11 +372,11 @@ class LlavaOnevisionMultiModalProjector(nn.Module):
 
         self.linear_1 = nn.Linear(config.vision_config.hidden_size,
                                   config.text_config.hidden_size,
-                                  bias=True)
+                                  bias=config.multimodal_projector_bias)
         self.act = get_act_fn(config.projector_hidden_act)
         self.linear_2 = nn.Linear(config.text_config.hidden_size,
                                   config.text_config.hidden_size,
-                                  bias=True)
+                                  bias=config.multimodal_projector_bias)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
-- 
GitLab


From bb392af434a49b3f8655f0e78737ced6524056b7 Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Tue, 4 Feb 2025 02:05:04 -0500
Subject: [PATCH 26/65] [Doc] Replace ibm-fms with ibm-ai-platform (#12709)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 docs/source/features/spec_decode.md           | 12 ++++++------
 examples/offline_inference/mlpspeculator.py   |  2 +-
 tests/models/registry.py                      |  2 +-
 tests/spec_decode/e2e/test_mlp_correctness.py |  2 +-
 vllm/model_executor/models/mlp_speculator.py  |  2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index da8712705..1e468962c 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -131,7 +131,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM(
     model="meta-llama/Meta-Llama-3.1-70B-Instruct",
     tensor_parallel_size=4,
-    speculative_model="ibm-fms/llama3-70b-accelerator",
+    speculative_model="ibm-ai-platform/llama3-70b-accelerator",
     speculative_draft_tensor_parallel_size=1,
 )
 outputs = llm.generate(prompts, sampling_params)
@@ -149,11 +149,11 @@ limitation will be fixed in a future release.
 
 A variety of speculative models of this type are available on HF hub:
 
-- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator)
-- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator)
-- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator)
-- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator)
-- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator)
+- [llama-13b-accelerator](https://huggingface.co/ibm-ai-platform/llama-13b-accelerator)
+- [llama3-8b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-8b-accelerator)
+- [codellama-34b-accelerator](https://huggingface.co/ibm-ai-platform/codellama-34b-accelerator)
+- [llama2-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama2-70b-accelerator)
+- [llama3-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-70b-accelerator)
 - [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator)
 - [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator)
 - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
index 10d9de8cb..f227e71ba 100644
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -51,7 +51,7 @@ if __name__ == "__main__":
     # Create an LLM with spec decoding
     llm = LLM(
         model="meta-llama/Llama-2-13b-chat-hf",
-        speculative_model="ibm-fms/llama-13b-accelerator",
+        speculative_model="ibm-ai-platform/llama-13b-accelerator",
     )
 
     print("With speculation")
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 8a0ade4fa..7b5032f79 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -278,7 +278,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
     "MedusaModel": _HfExamplesInfo("JackFram/llama-68m",
                                    speculative_model="abhigoyal/vllm-medusa-llama-68m-random"),  # noqa: E501
     "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
-                                                    speculative_model="ibm-fms/llama-160m-accelerator"),  # noqa: E501
+                                                    speculative_model="ibm-ai-platform/llama-160m-accelerator"),  # noqa: E501
 }
 
 _FALLBACK_MODEL = {
diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
index a2b84b902..59beca47a 100644
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -33,7 +33,7 @@ from .conftest import run_equality_correctness_test
 MAIN_MODEL = "JackFram/llama-160m"
 
 # speculative model
-SPEC_MODEL = "ibm-fms/llama-160m-accelerator"
+SPEC_MODEL = "ibm-ai-platform/llama-160m-accelerator"
 
 # max. number of speculative tokens: this corresponds to
 # n_predict in the config.json of the speculator model.
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index cf4123a2c..2920427f9 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -64,7 +64,7 @@ class MLPSpeculator(nn.Module):
     https://arxiv.org/pdf/2404.19124
 
     Trained speculators of this type are available on HF hub at:
-    https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite
+    https://huggingface.co/ibm-ai-platform and https://huggingface.co/ibm-granite
     """
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
-- 
GitLab


From 4896d0c2dd367efbdf8387028322bf5f74359930 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Tue, 4 Feb 2025 02:27:11 -0500
Subject: [PATCH 27/65] [Quant] Fix use_mla TypeError and support loading
 pure-sparsity Compressed Tensors configs (#12711)

---
 vllm/config.py                                               | 5 +++--
 .../quantization/compressed_tensors/compressed_tensors.py    | 5 +++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 2f4a7ad76..bc4bf627b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1000,8 +1000,9 @@ class ModelConfig:
         # have fp8 for both weights and activations.
         if self.quantization == "compressed-tensors":
             quant_config = self._parse_quant_hf_config()
-            for group_name, cfg in quant_config.get("config_groups",
-                                                    ("", {})).items():
+            for group_name, cfg in quant_config.get("config_groups", {
+                    "": {}
+            }).items():
                 act_cfg = cfg.get("input_activations", {})
                 act_type = None if act_cfg is None else act_cfg.get("type", "")
                 w_cfg = cfg.get("weights", {})
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 24f7542e1..1a11b2419 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -424,6 +424,11 @@ class CompressedTensorsConfig(QuantizationConfig):
                                          or input_quant is not None,
                                          weight_quant=weight_quant,
                                          input_quant=input_quant)
+        elif weight_quant is None:
+            logger.warning_once("Acceleration for non-quantized schemes is "
+                                "not supported by Compressed Tensors. "
+                                "Falling back to UnquantizedLinearMethod")
+            return None
         else:
             # Find the quant_scheme
             scheme = self._get_scheme_from_parts(  # type: ignore
-- 
GitLab


From c36ac98d0118537ec5f3f405a68311a10f9b59a5 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Tue, 4 Feb 2025 03:24:11 -0500
Subject: [PATCH 28/65] [AMD][ROCm] Enable DeepSeek model on ROCm (#12662)

Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
---
 tests/kernels/test_rocm_attention_selector.py | 31 +++++++++++++++++++
 tests/worker/test_model_runner.py             |  9 ++++++
 vllm/attention/backends/mla/utils.py          |  6 +++-
 .../layers/quantization/utils/fp8_utils.py    | 10 ++++++
 vllm/platforms/rocm.py                        |  3 ++
 5 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 tests/kernels/test_rocm_attention_selector.py

diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py
new file mode 100644
index 000000000..5848dc014
--- /dev/null
+++ b/tests/kernels/test_rocm_attention_selector.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from tests.kernels.utils import override_backend_env_variable
+from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.platforms.rocm import RocmPlatform
+
+
+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Clear lru cache to ensure each test case runs without caching.
+    """
+    _cached_get_attn_backend.cache_clear()
+
+
+def test_selector(monkeypatch):
+    """Test that the attention selector for ROCm.
+    """
+    override_backend_env_variable(monkeypatch, "ROCM_FLASH")
+
+    with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+        assert backend.get_name() == "ROCM_FLASH"
+        # mla test for deepseek related
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
+                                   False, True)
+        assert backend.get_name() == "TRITON_MLA"
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index c32ceb4fa..3f9a0d6fa 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -24,6 +24,15 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
     return model_runner
 
 
+def test_deepseek_mla_attn_backend_module():
+    model_runner = _create_model_runner(
+        "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        trust_remote_code=True,
+        enable_chunked_prefill=False,
+    )
+    assert model_runner.attn_backend.__name__ == "TritonMLABackend"
+
+
 @pytest.mark.parametrize("batch_size", list(range(1, 257)))
 def test_prepare_prompt(batch_size):
     model_runner = _create_model_runner(
diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py
index 9b63192ed..8e584cca3 100644
--- a/vllm/attention/backends/mla/utils.py
+++ b/vllm/attention/backends/mla/utils.py
@@ -27,7 +27,11 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     scaled_dequantize, scaled_quantize)
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-from vllm.vllm_flash_attn import flash_attn_varlen_func
+
+try:
+    from vllm.vllm_flash_attn import flash_attn_varlen_func
+except ImportError:
+    from flash_attn import flash_attn_varlen_func
 
 
 @dataclass
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 29c7268ad..10ff71e57 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -47,6 +47,16 @@ def apply_w8a8_block_fp8_linear(
 
     shape_supported_by_cutlass = (weight.shape[0] % 128 == 0
                                   and weight.shape[1] % 128 == 0)
+    if current_platform.is_rocm():
+        scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) +
+                         input_2d.shape[:-1])[::-1]
+        scale_b_shape = (weight_scale.view(-1, 1)
+                         if weight_scale.dim() <= 1 else weight_scale.T).shape
+        ar, ac = scale_a_shape
+        br, bc = scale_b_shape
+        if (ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0])
+                or br not in (1, weight.shape[0])):
+            shape_supported_by_cutlass = False
     if cutlass_block_fp8_supported and shape_supported_by_cutlass:
         q_input, x_scale = per_token_group_quant_fp8(input_2d,
                                                      block_size[1],
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index cd851c0d8..035766289 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -79,6 +79,9 @@ class RocmPlatform(Platform):
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                              kv_cache_dtype, block_size, use_v1,
                              use_mla) -> str:
+        if use_mla:
+            logger.info("Using Triton MLA backend.")
+            return "vllm.attention.backends.triton_mla.TritonMLABackend"
         selected_backend = (_Backend.ROCM_FLASH if selected_backend
                             == _Backend.FLASH_ATTN else selected_backend)
         if selected_backend == _Backend.ROCM_FLASH:
-- 
GitLab


From 96b23621c16d4e3b65380c6af3a7d7bac79cfa5b Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 4 Feb 2025 16:27:36 +0800
Subject: [PATCH 29/65] [Misc] Add BNB quantization for Whisper (#12381)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/model_loader/loader.py | 102 ++++++++++++---------
 vllm/model_executor/model_loader/utils.py  |   7 ++
 vllm/model_executor/models/whisper.py      |  17 +++-
 3 files changed, 82 insertions(+), 44 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 809af81d7..19e3bc6a2 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -803,9 +803,11 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             iterator = safetensors_weights_iterator(hf_weights_files)
         else:
             iterator = pt_weights_iterator(hf_weights_files)
-        for name, param in iterator:
-            # mapping weight names from transformers to vllm.
-            yield self.weight_mapper(name), param
+        for org_name, param in iterator:
+            # mapping weight names from transformers to vllm while preserving
+            # original names.
+            mapped_name = self.weight_mapper(org_name)
+            yield org_name, mapped_name, param
 
     def _get_quantized_weights_iterator(
         self,
@@ -866,24 +868,30 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
     def _quantized_8bit_generator(self, hf_weights_files, use_safetensors,
                                   quant_state_dict) -> Generator:
-        for weight_name, weight_tensor in self._hf_weight_iter(
-                hf_weights_files, use_safetensors):
-            if not weight_name.lower().endswith(".scb"):
+        for (
+                org_weight_name,
+                mapped_weight_name,
+                weight_tensor,
+        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
+            if not mapped_weight_name.lower().endswith(".scb"):
                 continue
 
-            weight_key = weight_name.lower().replace(".scb", ".weight")
+            weight_key = mapped_weight_name.lower().replace(".scb", ".weight")
             quant_state_dict[weight_key] = weight_tensor
 
-        for weight_name, weight_tensor in self._hf_weight_iter(
-                hf_weights_files, use_safetensors):
-            if self._is_8bit_weight_name(weight_name):
+        for (
+                org_weight_name,
+                mapped_weight_name,
+                weight_tensor,
+        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
+            if self._is_8bit_weight_name(mapped_weight_name):
                 continue
 
-            if weight_name in quant_state_dict:
+            if mapped_weight_name in quant_state_dict:
                 set_weight_attrs(weight_tensor, {"load_in_8bit": True})
-                yield weight_name, weight_tensor
+                yield org_weight_name, weight_tensor
             else:
-                yield weight_name, weight_tensor
+                yield org_weight_name, weight_tensor
 
     def _quantized_4bit_generator(self, hf_weights_files, use_safetensors,
                                   quant_state_dict) -> Generator:
@@ -893,15 +901,19 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         weight_iterator = self._hf_weight_iter(hf_weights_files,
                                                use_safetensors)
         temp_state_dict = {}
-        for weight_name, weight_tensor in weight_iterator:
-            if not self._is_4bit_weight_name(weight_name):
+        for (
+                org_weight_name,
+                mapped_weight_name,
+                weight_tensor,
+        ) in weight_iterator:
+            if not self._is_4bit_weight_name(mapped_weight_name):
                 continue
             # bitsandbytes library requires
             # weight.quant_state.bitsandbytes__* in CPU
-            if "quant_state.bitsandbytes" in weight_name:
-                temp_state_dict[weight_name] = weight_tensor.cpu().data
+            if "quant_state.bitsandbytes" in mapped_weight_name:
+                temp_state_dict[mapped_weight_name] = weight_tensor.cpu().data
             else:
-                temp_state_dict[weight_name] = weight_tensor
+                temp_state_dict[mapped_weight_name] = weight_tensor
 
         # Closure to parse quant_state for each prequant weight
         def _parse_quant_state(param_name: str,
@@ -915,20 +927,24 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
         # Second iterate over all prequant and normal weights
         # pre quantized weights would have a quant_state
-        for weight_name, weight_tensor in self._hf_weight_iter(
-                hf_weights_files, use_safetensors):
-            if self._is_4bit_weight_name(weight_name):
+        for (
+                org_weight_name,
+                mapped_weight_name,
+                weight_tensor,
+        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
+            if self._is_4bit_weight_name(mapped_weight_name):
                 continue
 
-            if (f"{weight_name}.quant_state.bitsandbytes__nf4"
+            if (f"{mapped_weight_name}.quant_state.bitsandbytes__nf4"
                     in temp_state_dict) or (
-                        f"{weight_name}.quant_state.bitsandbytes__fp4"
+                        f"{mapped_weight_name}.quant_state.bitsandbytes__fp4"
                         in temp_state_dict):
-                quant_state = _parse_quant_state(weight_name, temp_state_dict)
-                quant_state_dict[weight_name] = quant_state
-                yield weight_name, weight_tensor
+                quant_state = _parse_quant_state(mapped_weight_name,
+                                                 temp_state_dict)
+                quant_state_dict[mapped_weight_name] = quant_state
+                yield org_weight_name, weight_tensor
             else:
-                yield weight_name, weight_tensor
+                yield org_weight_name, weight_tensor
 
     def _unquantized_generator(self, hf_weights_files, use_safetensors,
                                quant_state_dict) -> Generator:
@@ -937,18 +953,22 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         tp_size = get_tensor_model_parallel_world_size()
         tp_rank = get_tensor_model_parallel_rank()
 
-        for weight_name, weight_tensor in self._hf_weight_iter(
-                hf_weights_files, use_safetensors):
-            if any(target_module in weight_name for target_module in
-                   self.target_modules) and weight_name.endswith(".weight"):
+        for (
+                org_weight_name,
+                mapped_weight_name,
+                weight_tensor,
+        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
+            if any(target_module in mapped_weight_name
+                   for target_module in self.target_modules
+                   ) and mapped_weight_name.endswith(".weight"):
                 # Without sharding
                 if any(
-                        weight_name.startswith(module)
+                        mapped_weight_name.startswith(module)
                         for module in self.unsharded_weights_modules):
                     weight_sub_tensor = weight_tensor
                 # Shard by column
                 elif any(
-                        weight_name.startswith(module)
+                        mapped_weight_name.startswith(module)
                         for module in self.column_sharded_weights_modules):
                     total_size = weight_tensor.size(-1)
                     start_index = total_size // tp_size * tp_rank
@@ -958,14 +978,14 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 # Weights have fused on disk. In this case, we assume that the
                 # weight and module use same name.
                 elif any(
-                        weight_name.startswith(module)
+                        mapped_weight_name.startswith(module)
                         for module in self.maybe_fused_weights_modules):
                     # special case for fused weights
                     # get the size of each shard weight tensor
                     total_shard_sizes = next(
                         (sizes for module, sizes in
                          self.maybe_fused_weights_modules.items()
-                         if weight_name.startswith(module)))
+                         if mapped_weight_name.startswith(module)))
                     total_size = weight_tensor.size(0)
                     assert total_size == sum(total_shard_sizes)
                     # get the start/end index of each shard weight tensor
@@ -1008,23 +1028,21 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                         quant_type="nf4",
                     )
 
-                quant_state_dict[weight_name] = quant_state
+                quant_state_dict[mapped_weight_name] = quant_state
             else:
                 processed_weight = weight_tensor
-
-            yield weight_name, processed_weight
+            yield org_weight_name, processed_weight
 
     def _get_bnb_target_modules(self, model: nn.Module) -> None:
 
         for name, module in model.named_modules():
             if isinstance(module, (LinearBase, )):
-                last_name = name.split(".")[-1]
-                if sub_modules := self.modules_mapping.packed_mapping.get(
-                        last_name, []):
+                if modules_info := self.modules_mapping.get_sub_modules(name):
                     # Map vllm's names to transformers's names.
+                    rep_name, sub_modules = modules_info
                     for sub_name in sub_modules:
                         self.target_modules.append(
-                            name.replace(last_name, sub_name))
+                            name.replace(rep_name, sub_name))
                 # Add original module name even if the module has stacked map,
                 # in case model has a mixture of disk-merged and disk-splitted
                 # weights with same last name.
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index eb334c1fd..7a82a695c 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -131,3 +131,10 @@ class ParamMapping:
                     packed_name,
                     index,
                 )
+
+    def get_sub_modules(self,
+                        module_name: str) -> Optional[Tuple[str, List[str]]]:
+        for key, value in self.packed_mapping.items():
+            if module_name.endswith(key):
+                return key, value
+        return None
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 2319c3160..0a3011d36 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -638,6 +638,19 @@ def input_mapper_for_whisper(
 @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
     "audio", get_max_whisper_audio_tokens)
 class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal):
+    packed_modules_mapping = {
+        "self_attn.qkv_proj": [
+            "self_attn.q_proj",
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+        ],
+        "encoder_attn.kv_proj": ["encoder_attn.k_proj", "encoder_attn.v_proj"],
+    }
+
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={
+        ".fc1.": ".mlp.fc1.",
+        ".fc2.": ".mlp.fc2."
+    })
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -731,10 +744,10 @@ class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal):
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."])
-        mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."})
+
         # add fake zeros bias for k_proj to state_dict
         weights = _create_fake_bias_for_k_proj(weights)
-        return loader.load_weights(weights, mapper=mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 def _create_fake_bias_for_k_proj(
-- 
GitLab


From d1ca7df84d9f8853001bdf1c2900321d9cb5d64e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 4 Feb 2025 16:44:52 +0800
Subject: [PATCH 30/65] [VLM] Merged multi-modal processor for InternVL-based
 models (#12553)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 docs/source/contributing/model/multimodal.md  |   6 +-
 docs/source/models/supported_models.md        |  10 +-
 .../vision_language/test_h2ovl.py             | 131 ---
 .../vision_language/test_models.py            |   2 +-
 .../vision_language/vlm_utils/model_utils.py  |  37 +-
 .../multimodal/processing/test_common.py      |   4 +-
 .../multimodal/processing/test_h2ovl.py       | 142 +++
 .../multimodal/processing/test_internvl.py    | 207 +----
 .../multimodal/processing/test_llava_next.py  |  15 +-
 .../processing/test_llava_onevision.py        |  15 +-
 .../multimodal/processing/test_phi3v.py       |   5 +-
 .../multimodal/processing/test_qwen2_vl.py    |   5 +-
 vllm/model_executor/models/aria.py            |   6 +-
 vllm/model_executor/models/blip2.py           |   6 +-
 vllm/model_executor/models/chameleon.py       |   6 +-
 vllm/model_executor/models/deepseek_vl2.py    |   6 +-
 vllm/model_executor/models/fuyu.py            |   6 +-
 vllm/model_executor/models/h2ovl.py           | 621 ++++++++-----
 vllm/model_executor/models/internvl.py        | 823 +++++++++++-------
 vllm/model_executor/models/llava.py           |   6 +-
 .../model_executor/models/llava_next_video.py |   6 +-
 vllm/model_executor/models/llava_onevision.py |   6 +-
 vllm/model_executor/models/minicpmo.py        |  28 +-
 vllm/model_executor/models/minicpmv.py        |  50 +-
 vllm/model_executor/models/nvlm_d.py          | 186 +++-
 vllm/model_executor/models/phi3v.py           |   6 +-
 vllm/model_executor/models/qwen.py            |  12 +-
 vllm/model_executor/models/qwen2_audio.py     |   6 +-
 vllm/model_executor/models/qwen2_vl.py        |  31 +-
 vllm/model_executor/models/ultravox.py        |   6 +-
 vllm/multimodal/inputs.py                     |  11 +
 vllm/multimodal/processing.py                 |   6 +-
 vllm/multimodal/profiling.py                  |   3 +-
 vllm/multimodal/registry.py                   |   4 +-
 34 files changed, 1434 insertions(+), 986 deletions(-)
 delete mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py
 create mode 100644 tests/models/multimodal/processing/test_h2ovl.py

diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index 6c6f3b701..66a7554da 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -250,7 +250,11 @@ def get_max_image_tokens(self) -> int:
 And thus, we can override the method as:
 
 ```python
-def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+def get_mm_max_tokens_per_item(
+    self,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+) -> Mapping[str, int]:
     return {"image": self.get_max_image_tokens()}
 ```
 
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 4a0996469..fbdca189a 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -726,7 +726,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
   *
   * ✅︎
-  *
+  * \*
 - * `Idefics3ForConditionalGeneration`
   * Idefics3
   * T + I
@@ -799,7 +799,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * ✅︎
 - * `NVLM_D_Model`
   * NVLM-D 1.0
-  * T + I<sup>E+</sup>
+  * T + I<sup>+</sup>
   * `nvidia/NVLM-D-72B`, etc.
   *
   * ✅︎
@@ -859,7 +859,11 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 :::{note}
-To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+:::
+
+:::{note}
+H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
 :::
 
 :::{note}
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
deleted file mode 100644
index 9590adf6f..000000000
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Optional, Tuple
-
-import pytest
-import torch
-from PIL.Image import Image
-from transformers import AutoConfig
-
-# Import the functions to test
-from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
-                                              image_to_pixel_values_wrapper)
-from vllm.multimodal.image import rescale_image_size
-
-models = [
-    "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
-    "h2oai/h2ovl-mississippi-2b",
-]
-
-
-def run_preprocessing_test(
-    image: Image,
-    config,
-    max_dynamic_patch: Optional[int] = None,
-) -> Tuple[torch.Tensor, int]:
-    """Test the image preprocessing and calculate expected blocks."""
-
-    if max_dynamic_patch is None:
-        max_dynamic_patch = config.max_dynamic_patch
-
-    width, height = image.size
-    use_MSAC = config.use_msac
-
-    # Create the mapper function with the provided configuration
-    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
-    pixel_values = mapper(image)
-
-    # Calculate the expected number of blocks
-    if use_MSAC:
-        # First pass
-        blocks1, _, _, aspect_ratio = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,  # Thumbnail is handled separately
-            prior_aspect_ratio=None,
-        )
-
-        # Second pass
-        blocks2, _, _, _ = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,
-            prior_aspect_ratio=aspect_ratio,
-        )
-
-        # Add thumbnail if use_thumbnail is True and total_blocks > 1
-        if config.use_thumbnail:
-            blocks1 += 1 if blocks1 > 1 else 0
-            blocks2 += 1 if blocks2 > 1 else 0
-
-        # Total blocks is the sum of blocks from both passes minus overlapping
-        total_blocks = blocks1 + blocks2 - 1
-
-        expected_blocks = total_blocks
-
-    else:
-        blocks, _, _, _ = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,
-            prior_aspect_ratio=None,
-        )
-        expected_blocks = blocks
-
-        if config.use_thumbnail and expected_blocks > 1:
-            expected_blocks += 1
-
-    return pixel_values, expected_blocks
-
-
-@pytest.mark.parametrize("model_name", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
-def test_image_preprocessing(image_assets, model_name, size_factors,
-                             max_dynamic_patch):
-    """Test image preprocessing pipeline with different configurations."""
-    # Load the configuration from the model
-    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-
-    for asset in image_assets:
-        image = asset.pil_image
-        for factor in size_factors:
-            scaled_image = rescale_image_size(image, factor)
-
-            # Test preprocessing and get expected number of blocks
-            pixel_values, expected_blocks = run_preprocessing_test(
-                scaled_image, config, max_dynamic_patch)
-
-            # Verify output shapes and properties
-            actual_blocks = pixel_values.shape[0]
-            assert actual_blocks == expected_blocks, (
-                f"Expected {expected_blocks} blocks, got {actual_blocks}")
-
-            # Check image dimensions
-            expected_size = (
-                3,  # Number of channels (C, H, W)
-                config.vision_config.image_size,
-                config.vision_config.image_size,
-            )
-            for img in pixel_values:
-                assert img.shape == expected_size, (
-                    f"Expected image size {expected_size}, got {img.shape}")
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index e3cda8971..7a14ba2f3 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -250,6 +250,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=8192,
         dtype="bfloat16",
         use_tokenizer_eos=True,
+        num_logprobs=10,
         patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
     ),
     "idefics3": VLMTestInfo(
@@ -282,7 +283,6 @@ VLM_TEST_SETTINGS = {
         dtype="bfloat16",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
-        marks=[large_gpu_mark(min_gb=32)],
     ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index b0a88161c..d2401b222 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -334,12 +334,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         def __init__(self, hf_runner: HfRunner):
             self.num_image_token = hf_runner.model.num_image_token
             self.tokenizer = hf_runner.tokenizer
-            self.dtype = hf_runner.model.dtype
 
             self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                      trust_remote_code=True)
             self.vision_config = self.config.vision_config
             self.use_thumbnail = self.config.use_thumbnail
+            self.use_msac = self.config.use_msac
             self.min_num = self.config.min_dynamic_patch
             self.max_num = self.config.max_dynamic_patch
             self.image_size = self.vision_config.image_size
@@ -348,18 +348,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                      **kwargs):
             # yapf: disable
             from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
 
             # yapf: enable
             images = [images] if isinstance(images, Image) else images
             pixel_values = [
-                image_to_pixel_values(image,
-                                      self.image_size,
-                                      self.min_num,
-                                      self.max_num,
-                                      self.use_thumbnail,
-                                      use_MSAC=self.config.use_msac).to(
-                                          self.dtype) for image in images
+                image_to_pixel_values_h2ovl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                    use_msac=self.use_msac,
+                ) for image in images
             ]
             num_patches_list = [
                 pixel_value.shape[0] for pixel_value in pixel_values
@@ -394,7 +395,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         def __init__(self, hf_runner: HfRunner):
             self.num_image_token = hf_runner.model.num_image_token
             self.tokenizer = hf_runner.tokenizer
-            self.dtype = hf_runner.model.dtype
 
             self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                      trust_remote_code=True)
@@ -407,13 +407,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         def __call__(self, text: str, images: Union[Image, List[Image]],
                      **kwargs):
             from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START,
+                image_to_pixel_values_internvl)
             images = [images] if isinstance(images, Image) else images
             pixel_values = [
-                image_to_pixel_values(image, self.image_size, self.min_num,
-                                      self.max_num,
-                                      self.use_thumbnail).to(self.dtype)
-                for image in images
+                image_to_pixel_values_internvl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                ) for image in images
             ]
             num_patches_list = [
                 pixel_value.shape[0] for pixel_value in pixel_values
@@ -448,7 +452,8 @@ def _internvl_generate(
 ) -> torch.LongTensor:
     """Generate method for InternVL2 model without fixed use_cache."""
     assert self.img_context_token_id is not None
-    vit_embeds = self.extract_feature(pixel_values)
+    target_dtype = next(self.parameters()).dtype
+    vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
     input_embeds = self.language_model.get_input_embeddings()(input_ids)
     B, N, C = input_embeds.shape
     input_embeds = input_embeds.reshape(B * N, C)
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 3921d4e19..07906a71d 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -141,13 +141,14 @@ def _test_processing_correctness(
 
 
 # yapf: disable
-# True if the model supports multiple data items of the modality per request
 @pytest.mark.parametrize("model_id", [
     "rhymes-ai/Aria",
     "Salesforce/blip2-opt-2.7b",
     "facebook/chameleon-7b",
     "deepseek-ai/deepseek-vl2-tiny",
     "adept/fuyu-8b",
+    "h2oai/h2ovl-mississippi-800m",
+    "OpenGVLab/InternVL2-1B",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",
     "llava-hf/LLaVA-NeXT-Video-7B-hf",
@@ -156,6 +157,7 @@ def _test_processing_correctness(
     "mistral-community/pixtral-12b",
     "openbmb/MiniCPM-o-2_6",
     "openbmb/MiniCPM-V-2_6",
+    "nvidia/NVLM-D-72B",
     "Qwen/Qwen-VL-Chat",
     "Qwen/Qwen2-VL-2B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
new file mode 100644
index 000000000..767ac5eb9
--- /dev/null
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for H2OVL's multimodal preprocessing kwargs."""
+from typing import Optional
+
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.utils import cached_get_tokenizer
+
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", [
+    "h2oai/h2ovl-mississippi-800m",
+    "h2oai/h2ovl-mississippi-2b",
+])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
+@pytest.mark.parametrize("dynamic_image_size", [True, False])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(
+    model_id: str,
+    image_assets: _ImageAssets,
+    size_factors: list[int],
+    max_dynamic_patch: int,
+    dynamic_image_size: Optional[bool],
+    num_imgs: int,
+):
+    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
+                                                  get_h2ovl_target_ratios)
+
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        trust_remote_code=True,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
+
+    config = processor.info.get_hf_config()
+    use_msac = config.use_msac
+
+    mm_processor_kwargs = {
+        "max_dynamic_patch": max_dynamic_patch,
+    }
+    if dynamic_image_size is not None:
+        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
+
+    min_num = config.min_dynamic_patch
+    max_num = max_dynamic_patch if dynamic_image_size else 1
+
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<image>" * num_imgs
+
+    for asset in image_assets:
+        for factor in size_factors:
+            image = rescale_image_size(asset.pil_image, factor)
+            mm_data = {"image": [image] * num_imgs}
+
+            width, height = image.size
+
+            # Calculate the expected number of blocks
+            if num_imgs == 1 and use_msac:
+                # First pass
+                blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        min_num,
+                        max_num,
+                        prior_aspect_ratio=None,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,  # Thumbnail is handled separately
+                )
+
+                # Second pass
+                blocks2, _, _, _ = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        min_num,
+                        max_num,
+                        prior_aspect_ratio=aspect_ratio,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,
+                )
+
+                # Add thumbnail if use_thumbnail is True and total_blocks > 1
+                if config.use_thumbnail:
+                    blocks1 += 1 if blocks1 > 1 else 0
+                    blocks2 += 1 if blocks2 > 1 else 0
+
+                # Total blocks is the sum of blocks from both passes minus
+                # overlapping
+                total_blocks = blocks1 + blocks2 - 1
+
+                expected_num_patches = total_blocks
+            else:
+                blocks, _, _, _ = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        min_num,
+                        max_num,
+                        prior_aspect_ratio=None,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,
+                )
+                expected_num_patches = blocks
+
+                if config.use_thumbnail and expected_num_patches != 1:
+                    expected_num_patches += 1
+
+            processed_inputs = processor.apply(prompt, mm_data,
+                                               mm_processor_kwargs)
+            pixel_shape = (
+                processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
+
+            assert pixel_shape[0] == expected_num_patches * num_imgs
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index 0d921e9d3..ede961225 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -1,207 +1,64 @@
 # SPDX-License-Identifier: Apache-2.0
 """Tests for InternVL's multimodal preprocessing kwargs."""
-from typing import Callable, Optional
+from typing import Optional
 
 import pytest
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal import MultiModalRegistry
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
 
-models = ["OpenGVLab/InternVL2-2B"]
 
-
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def input_processor_for_internvl():
-    from vllm.model_executor.models.internvl import InternVLInputPipeline
-
-    pipeline = InternVLInputPipeline('<img>', '</img>', '<IMG_CONTEXT>')
-    return pipeline.input_processor
-
-
-@pytest.fixture()
-def dummy_data_for_internvl():
-    from vllm.model_executor.models.internvl import InternVLInputPipeline
-
-    pipeline = InternVLInputPipeline('<img>', '</img>', '<IMG_CONTEXT>')
-    return pipeline.dummy_data
-
-
-@pytest.fixture()
-def get_max_internvl_image_tokens():
-    from vllm.model_executor.models.internvl import (
-        get_max_internvl_image_tokens)
-    return get_max_internvl_image_tokens
-
-
-@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
 @pytest.mark.parametrize("max_dynamic_patch", [1, 4])
 @pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-def test_input_mapper_override(
-    model: str,
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(
+    model_id: str,
     image_assets: _ImageAssets,
     max_dynamic_patch: int,
     dynamic_image_size: Optional[bool],
-):
-    mm_processor_kwargs = {
-        "max_dynamic_patch": max_dynamic_patch,
-    }
-    if dynamic_image_size is not None:
-        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
-
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
-    if dynamic_image_size is False:
-        expected_num_patches = 1
-
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=mm_processor_kwargs,
-    )
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
-    vllm_result = mm_registry.map_input(
-        ctx.model_config,
-        {"image": image},
-    )
-    assert vllm_result["pixel_values"].size(1) == expected_num_patches
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None])
-@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-def test_max_tokens_override(
-    get_max_internvl_image_tokens: Callable,
-    model: str,
-    max_dynamic_patch: Optional[int],
-    dynamic_image_size: Optional[bool],
-):
-    """Ensure get_max_internvl_image_tokens handles mm_processor_kwargs."""
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-
-    if max_dynamic_patch is None:
-        max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
-    if dynamic_image_size is False:
-        expected_num_patches = 1
-    expected_max_tokens = 256 * expected_num_patches
-
-    actual_max_tokens = get_max_internvl_image_tokens(
-        ctx=InputContext(ctx.model_config),
-        max_dynamic_patch=max_dynamic_patch,
-        dynamic_image_size=dynamic_image_size,
-    )
-    assert expected_max_tokens == actual_max_tokens
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_imgs", [1, 2])
-@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None])
-@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-def test_dummy_data_override(
-    dummy_data_for_internvl: Callable,
-    model: str,
     num_imgs: int,
-    max_dynamic_patch: Optional[int],
-    dynamic_image_size: Optional[bool],
 ):
-    """Ensure dummy_data_for_internvl handles kwargs properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the dummy data func.
     ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
+        model_name=model_id,
+        tokenizer_name=model_id,
         trust_remote_code=True,
         mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
     )
-
-    if max_dynamic_patch is None:
-        max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
-    if dynamic_image_size is False:
-        expected_num_patches = 1
-    expected_max_tokens = 256 * expected_num_patches
-
-    dummy_data = dummy_data_for_internvl(
-        ctx=ctx,
-        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
-        mm_counts={"image": num_imgs},
-        max_dynamic_patch=max_dynamic_patch,
-        dynamic_image_size=dynamic_image_size,
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
     )
-    sequence_data = dummy_data.seq_data
-
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    image_token_id = tokenizer.encode('<IMG_CONTEXT>',
-                                      add_special_tokens=False)[0]
 
-    # Ensure we have the right number of placeholders per size
-    img_tok_count = sequence_data.get_token_ids().count(image_token_id)
-    assert img_tok_count == expected_max_tokens * num_imgs
+    mm_processor_kwargs = {
+        "max_dynamic_patch": max_dynamic_patch,
+    }
+    if dynamic_image_size is not None:
+        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
 
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<image>" * num_imgs
+    image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
+    mm_data = {"image": [image] * num_imgs}
 
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
-@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-@pytest.mark.parametrize("num_imgs", [1, 2])
-def test_input_processor_override(
-    input_processor_for_internvl: Callable,
-    image_assets: _ImageAssets,
-    model: str,
-    num_imgs: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
-):
-    """Ensure input_processor_for_internvl handles kwargs properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the custom input processor.
     expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
     if dynamic_image_size is False:
         expected_num_patches = 1
 
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    expected_toks_per_img = 256 * expected_num_patches
-
-    # Build the image str / prompt based on the number of images we pass
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
-    prompt = placeholders
-    images = [image_assets[0].pil_image.resize((448 * 2, 448 * 2))] * num_imgs
-
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": images})
-
-    processed_inputs = input_processor_for_internvl(
-        ctx,
-        inputs,
-        max_dynamic_patch=max_dynamic_patch,
-        dynamic_image_size=dynamic_image_size,
-    )
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
-    image_token_id = tokenizer.encode('<IMG_CONTEXT>',
-                                      add_special_tokens=False)[0]
+    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    assert img_tok_count == expected_toks_per_img * num_imgs
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+
+    assert img_tok_count == 256 * expected_num_patches * num_imgs
+    assert pixel_shape[0] == expected_num_patches * num_imgs
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index d2497e62d..fe4754c2e 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -43,7 +43,10 @@ def test_processor_max_tokens(model_id):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
     )
     info = processor.info
 
@@ -143,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
     )
 
     image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -173,7 +179,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
     )
 
     seen_aspect_ratios = set[float]()
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index bd4dbd46d..fb650d9e0 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -44,7 +44,10 @@ def test_processor_max_tokens(model_id):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
     )
     info = processor.info
 
@@ -143,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
     )
 
     image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -174,7 +180,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
     )
 
     seen_aspect_ratios = set[float]()
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index 44edec457..dde8904f2 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -38,7 +38,10 @@ def test_processor_override(
         trust_remote_code=True,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
         tokenizer=tokenizer,
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index 47c9b0add..ef8e97f82 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -33,7 +33,10 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
         tokenizer=tokenizer,
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 97502c38b..98df532aa 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -399,7 +399,11 @@ class AriaProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {"image": self.get_num_image_tokens()}
 
     def get_num_image_tokens(self) -> int:
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 2b0452222..0463a0b97 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -407,7 +407,11 @@ class Blip2ProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {"image": self.get_num_image_tokens()}
 
     def get_num_image_tokens(self) -> int:
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 9061a3128..b29dd65a8 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -64,7 +64,11 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {"image": self.get_num_image_tokens()}
 
     def get_num_image_tokens(self) -> int:
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 1343b9762..0eaf3a620 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -165,7 +165,11 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
                                 image_width=x[1], image_height=x[0]))
         return ImageSize(width=width, height=height)
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         max_image_size = self.get_image_size_with_most_features()
         max_image_tokens = self.get_num_image_tokens(
             image_height=max_image_size.height,
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 6d8c82968..50b5ef35d 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -80,7 +80,11 @@ class FuyuProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         target_width, target_height = self.get_image_size_with_most_features()
 
         max_ncols, max_nrows = self.get_image_feature_grid_size(
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 91c89b159..cf3e777a2 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -7,43 +7,55 @@
 # Copyright (c) 2024 H2O.AI
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
-from functools import partial
-from typing import List, Optional, Tuple
+from typing import Mapping, Optional
 
 import torch
 from PIL import Image
 from transformers import PretrainedConfig
 
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.utils import is_list_of
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   MultiModalDataItems)
+from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
+                                        PromptReplacementDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .intern_vit import InternVisionModel
-from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, InternVLChatModel,
-                       InternVLInputPipeline, build_transform,
-                       find_closest_aspect_ratio, get_internvl_num_patches)
+from .internvl import (IMG_CONTEXT, IMG_END, IMG_START,
+                       BaseInternVLProcessingInfo, BaseInternVLProcessor,
+                       InternVLChatModel, InternVLDummyInputsBuilder,
+                       InternVLMultiModalProcessor, build_transform,
+                       find_closest_aspect_ratio, get_internvl_target_ratios)
 
+logger = init_logger(__name__)
 
-# modified to include blocks generated in second pass
-def calculate_num_blocks(
-    orig_width: int,
-    orig_height: int,
-    min_num: int,
-    max_num: int,
-    image_size: int,
+
+def resolve_h2ovl_min_max_num(
+    *,
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: bool,
     use_thumbnail: bool,
-    prior_aspect_ratio=None,
-) -> Tuple[int, int, int, Tuple[int, int]]:
-    aspect_ratio = orig_width / orig_height
+) -> tuple[int, int]:
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
+
+    if use_thumbnail and max_dynamic_patch != 1:
+        max_dynamic_patch += 1
 
-    # calculate the existing image aspect ratio
-    target_ratios = set((i, j) for n in range(min_num, max_num + 1)
-                        for i in range(1, n + 1) for j in range(1, n + 1)
-                        if i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+    return min_dynamic_patch, max_dynamic_patch
+
+
+def get_h2ovl_target_ratios(
+    min_num: int,
+    max_num: int,
+    *,
+    prior_aspect_ratio: Optional[tuple[int, int]],
+) -> list[tuple[int, int]]:
+    target_ratios = get_internvl_target_ratios(min_num, max_num)
 
     # if prior_aspect_ratio is provided, filter the target ratios
     if prior_aspect_ratio is not None:
@@ -52,44 +64,66 @@ def calculate_num_blocks(
             ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0
         ]
 
+    return target_ratios
+
+
+# modified to include blocks generated in second pass
+def calculate_h2ovl_targets(
+    *,
+    orig_width: int,
+    orig_height: int,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[int, int, int, tuple[int, int]]:
+    aspect_ratio = orig_width / orig_height
+
     # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
-                                                    target_ratios, orig_width,
-                                                    orig_height, image_size)
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio,
+        target_ratios,
+        width=orig_width,
+        height=orig_height,
+        image_size=image_size,
+    )
 
     # calculate the target width and height
     target_width = image_size * target_aspect_ratio[0]
     target_height = image_size * target_aspect_ratio[1]
     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-    # add thumbnail image if num_blocks > 1
-    if use_thumbnail and blocks > 1:
+
+    # add thumbnail image if num_blocks != 1
+    if use_thumbnail and blocks != 1:
         blocks += 1
+
     return blocks, target_width, target_height, target_aspect_ratio
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-# refactored to handle prior_aspect_ratio as optional
-def dynamic_preprocess(
+# refactored to handle prior_aspect_ratio
+def dynamic_preprocess_h2ovl(
     image: Image.Image,
-    min_num: int,
-    max_num: int,
+    *,
+    target_ratios: list[tuple[int, int]],
     image_size: int,
     use_thumbnail: bool,
-    prior_aspect_ratio: Optional[Tuple[int, int]] = None,
-) -> Tuple[List[Image.Image], Tuple[int, int]]:
+) -> tuple[list[Image.Image], tuple[int, int]]:
     orig_width, orig_height = image.size
 
-    # calculate the number of blocks based on prior aspect ratio if available
-    blocks, target_width, target_height, target_aspect_ratio = (
-        calculate_num_blocks(
-            orig_width,
-            orig_height,
-            min_num,
-            max_num,
-            image_size,
-            use_thumbnail=False,
-            prior_aspect_ratio=prior_aspect_ratio,
-        ))
+    # calculate the number of blocks without thumbnail
+    (
+        blocks,
+        target_width,
+        target_height,
+        target_aspect_ratio,
+    ) = calculate_h2ovl_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=target_ratios,
+        image_size=image_size,
+        use_thumbnail=False,
+    )
+
     # resize the image
     resized_img = image.resize((target_width, target_height))
     processed_images = []
@@ -103,276 +137,393 @@ def dynamic_preprocess(
         # split the image
         split_img = resized_img.crop(box)
         processed_images.append(split_img)
+
     assert len(processed_images) == blocks
+
     if use_thumbnail and len(processed_images) != 1:
         thumbnail_img = image.resize((image_size, image_size))
         processed_images.append(thumbnail_img)
+
     return processed_images, target_aspect_ratio
 
 
-def load_image(
+def _preprocess_image(
     image: Image.Image,
-    input_size=448,
-    min_num=1,
-    max_num=6,
-    use_thumbnail=True,
-    prior_aspect_ratio: Optional[Tuple[int, int]] = None,
-) -> Tuple[torch.Tensor, Tuple[int, int]]:
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+    prior_aspect_ratio: Optional[tuple[int, int]],
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    target_ratios = get_h2ovl_target_ratios(
+        min_num,
+        max_num,
+        prior_aspect_ratio=prior_aspect_ratio,
+    )
+
     transform = build_transform(input_size=input_size)
-    images, target_aspect_ratio = dynamic_preprocess(
+    images, target_aspect_ratio = dynamic_preprocess_h2ovl(
         image,
         image_size=input_size,
         use_thumbnail=use_thumbnail,
-        min_num=min_num,
-        max_num=max_num,
-        prior_aspect_ratio=prior_aspect_ratio,
+        target_ratios=target_ratios,
     )
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values)
+
+    pixel_values = torch.stack([transform(image) for image in images])
     return pixel_values, target_aspect_ratio
 
 
-# refactored to use the combined load_image function
-def image_to_pixel_values(
+# refactored to use the _preprocess_image function
+def image_to_pixel_values_h2ovl(
     image: Image.Image,
+    *,
     input_size: int,
     min_num: int,
     max_num: int,
     use_thumbnail: bool,
-    use_MSAC: bool,
+    use_msac: bool,
 ) -> torch.Tensor:
     # when MSAC is turned on, we need to process the image twice
-    if use_MSAC:
+    if use_msac:
         # first pass
-        pixel_values, target_aspect_ratio = load_image(
+        pixel_values1, aspect_ratio1 = _preprocess_image(
             image,
             input_size=input_size,
             min_num=min_num,
             max_num=max_num,
             use_thumbnail=True,
+            prior_aspect_ratio=None,
         )
         # second pass
-        pixel_values2, _ = load_image(
+        pixel_values2, _ = _preprocess_image(
             image,
             input_size=input_size,
-            min_num=min_num,
+            min_num=3,  # Hardcoded value
             max_num=max_num,
-            prior_aspect_ratio=target_aspect_ratio,
+            use_thumbnail=True,
+            prior_aspect_ratio=aspect_ratio1,
         )
         # combine pixel values
         pixel_values = torch.cat(
-            [pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
+            [pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0)
 
     else:
-        pixel_values, _ = load_image(
+        pixel_values, _ = _preprocess_image(
             image,
             input_size=input_size,
             min_num=min_num,
             max_num=max_num,
             use_thumbnail=use_thumbnail,
+            prior_aspect_ratio=None,
         )
 
     return pixel_values
 
 
-def image_to_pixel_values_wrapper(hf_config: PretrainedConfig,
-                                  max_dynamic_patch: Optional[int] = None,
-                                  use_MSAC: Optional[bool] = None):
-    image_size = hf_config.vision_config.image_size
-    min_num = hf_config.min_dynamic_patch
-    if max_dynamic_patch is None:
-        max_dynamic_patch = hf_config.max_dynamic_patch
-    if use_MSAC is None:
-        use_MSAC = hf_config.use_msac
-    use_thumbnail = hf_config.use_thumbnail
-    return partial(
-        image_to_pixel_values,
-        input_size=image_size,
-        min_num=min_num,
-        max_num=max_dynamic_patch,
-        use_thumbnail=use_thumbnail,
-        use_MSAC=use_MSAC,
-    )
-
-
-def get_max_internvl_image_tokens(ctx: InputContext,
-                                  *,
-                                  max_dynamic_patch: Optional[int] = None):
-    """
-    Calculate the maximum number of tokens with/without MSAC and thumbnail
-    """
-    hf_config = ctx.get_hf_config()
-    use_thumbnail = hf_config.use_thumbnail
-    use_MSAC = hf_config.use_msac
+class H2OVLProcessor(BaseInternVLProcessor):
 
-    if max_dynamic_patch is None:
-        max_dynamic_patch = hf_config.max_dynamic_patch
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        use_msac: Optional[bool] = None,
+    ) -> None:
+        super().__init__(
+            config,
+            tokenizer,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
 
-    num_patches = get_internvl_num_patches(hf_config)
+        if use_msac is None:
+            use_msac = config.use_msac
+        assert isinstance(use_msac, bool)
 
-    coefficient = 2 if use_MSAC else 1
-    num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0)
+        self.use_msac = use_msac
 
-    return num_blocks * num_patches
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[IMG_CONTEXT]
 
+    def get_image_repl_features(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        return IMG_CONTEXT * feature_size
 
-class H2OVLInputPipeline(InternVLInputPipeline):
-    """
-    Input pipeline for processing image and text data for the H2OVL model.
-    """
+    def get_image_repl_full(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        features = self.get_image_repl_features(feature_size, num_patches)
+        return IMG_START + features + IMG_END
 
-    def input_processor(
+    def resolve_min_max_num(
         self,
-        ctx: InputContext,
-        inputs: DecoderOnlyInputs,
         *,
         max_dynamic_patch: Optional[int] = None,
-    ) -> DecoderOnlyInputs:
-        # get multi_modal_data
-        multi_modal_data = inputs.get("multi_modal_data")
-        if multi_modal_data is None or "image" not in multi_modal_data:
-            return inputs
-
-        model_config = ctx.model_config
-        hf_config = ctx.get_hf_config()
-        use_MSAC = hf_config.use_msac
-
-        image_data = multi_modal_data["image"]
-        num_patches = get_internvl_num_patches(hf_config)
-
-        image_pixel_values_mapper = image_to_pixel_values_wrapper(
-            hf_config, max_dynamic_patch=max_dynamic_patch)
-
-        # single image
-        if isinstance(image_data, Image.Image):
-            pixel_values = image_pixel_values_mapper(image_data,
-                                                     use_MSAC=use_MSAC)
-            num_blocks = pixel_values.shape[0]
-            image_feature_sizes = [num_blocks * num_patches]
-            pixel_values = pixel_values.unsqueeze(0)
-
-        # multi images
-        elif is_list_of(image_data, Image.Image):
-            # Do not use MSAC for multi images
-            image_feature_sizes = []
-            pixel_values = [
-                image_pixel_values_mapper(image, use_MSAC=False)
-                for image in image_data
-            ]
-            for pixel_value in pixel_values:
-                num_blocks = pixel_value.shape[0]
-                image_feature_sizes.append(num_blocks * num_patches)
-
-        # image embeddings as input
-        elif isinstance(image_data, torch.Tensor):
-            _, image_feature_size, _ = image_data.shape
-            image_feature_sizes = [image_feature_size]
-            pixel_values = None
-
-        # multi-image image embeddings
-        elif is_list_of(image_data, torch.Tensor):
-
-            image_feature_sizes = []
-            for image_embed in image_data:
-                _, image_feature_size, _ = image_embed.shape
-                image_feature_sizes.append(image_feature_size)
-            pixel_values = None
+        dynamic_image_size: Optional[bool] = None,
+        use_thumbnail: Optional[bool] = None,
+    ) -> tuple[int, int]:
+        min_dynamic_patch = self.min_dynamic_patch
+        max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
+                             is None else max_dynamic_patch)
+        dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
+                              is None else dynamic_image_size)
+        use_thumbnail = (self.use_thumbnail
+                         if use_thumbnail is None else use_thumbnail)
+
+        return resolve_h2ovl_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
 
-        else:
-            raise TypeError(f"Invalid image type: {type(image_data)}")
+    def resolve_target_ratios(
+        self,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        use_thumbnail: Optional[bool] = None,
+        prior_aspect_ratio: Optional[tuple[int, int]] = None,
+    ) -> list[tuple[int, int]]:
+        min_num, max_num = self.resolve_min_max_num(
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+        if prior_aspect_ratio:  # hardcoded value for second pass of use_msac
+            min_num = 3
 
-        tokenizer = cached_get_tokenizer(
-            model_config.tokenizer,
-            trust_remote_code=model_config.trust_remote_code,
+        return get_h2ovl_target_ratios(
+            min_num,
+            max_num,
+            prior_aspect_ratio=prior_aspect_ratio,
         )
 
-        prompt = inputs.get("prompt")
-        prompt_token_ids = inputs["prompt_token_ids"]
-        if prompt is None:
-            prompt = tokenizer.decode(prompt_token_ids)
-
-        new_prompt = self._expand_image_prompt(prompt, image_feature_sizes,
-                                               num_patches)
-        new_prompt_token_ids = tokenizer.encode(new_prompt)
-
-        # Wrap image processing in input_processor to avoid duplication
-        image_token_id = tokenizer.encode(
-            self.img_context_token,
-            add_special_tokens=False,
-            return_tensors="pt",
-        )[0]
-
-        # Update multi_modal_data to return
-        if pixel_values is not None:
-            multi_modal_data = {
-                "image": {
-                    "pixel_values": pixel_values,
-                    "image_token_id": image_token_id,
-                }
-            }
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        use_msac: Optional[bool] = None,
+    ) -> int:
+        use_msac = (self.use_msac if use_msac is None else use_msac)
+
+        use_thumbnail = self.use_thumbnail
+
+        if use_msac:
+            target_ratios_1 = self.resolve_target_ratios(
+                use_thumbnail=False,  # Applied in calculate_targets
+            )
+            num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
+                orig_width=image_width,
+                orig_height=image_height,
+                image_size=self.image_size,
+                target_ratios=target_ratios_1,
+                use_thumbnail=True,
+            )
+
+            target_ratios_2 = self.resolve_target_ratios(
+                use_thumbnail=False,  # Applied in calculate_targets
+                prior_aspect_ratio=aspect_ratio_1,
+            )
+            num_patches_2, _, _, _ = calculate_h2ovl_targets(
+                orig_width=image_width,
+                orig_height=image_height,
+                image_size=self.image_size,
+                target_ratios=target_ratios_2,
+                use_thumbnail=True,
+            )
+
+            num_patches = num_patches_1 + num_patches_2 - 1
         else:
-            multi_modal_data = {"image": {"image_embeds": image_data}}
+            target_ratios = self.resolve_target_ratios(
+                use_thumbnail=False,  # Applied in calculate_targets
+            )
+            num_patches, _, _, _ = calculate_h2ovl_targets(
+                orig_width=image_width,
+                orig_height=image_height,
+                image_size=self.image_size,
+                target_ratios=target_ratios,
+                use_thumbnail=use_thumbnail,
+            )
+
+        return num_patches * self.num_image_token
 
-        return token_inputs(
-            prompt=prompt,
-            prompt_token_ids=new_prompt_token_ids,
-            multi_modal_data=multi_modal_data,
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> list[torch.Tensor]:
+        use_msac = self.use_msac if len(images) == 1 else False
+
+        min_num, max_num = self.resolve_min_max_num(
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
         )
 
-    def input_mapper(
+        return [
+            image_to_pixel_values_h2ovl(
+                image,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=self.use_thumbnail,
+                use_msac=use_msac,
+            ) for image in images
+        ]
+
+
+class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
+
+    def get_hf_processor(
         self,
-        ctx: InputContext,
-        data: object,
         *,
         max_dynamic_patch: Optional[int] = None,
-    ) -> MultiModalKwargs:
+        dynamic_image_size: Optional[bool] = None,
+    ) -> H2OVLProcessor:
+        return H2OVLProcessor(
+            self.get_hf_config(),
+            self.get_tokenizer(),
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        max_tokens_one_image = self.get_max_image_tokens(use_msac=None)
+        if mm_counts.get("image", 0) <= 1:
+            max_tokens_per_image = max_tokens_one_image
+        else:
+            max_tokens_per_image = self.get_max_image_tokens(use_msac=False)
+
+        return {"image": max_tokens_per_image}
 
-        # NOTE: Preprocessing for the image data is done in the
-        # 'input_processor' function during actual inference.
-        if isinstance(data, dict):
-            return MultiModalKwargs(data)
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        processor: Optional[H2OVLProcessor],
+        use_msac: Optional[bool] = None,
+    ) -> int:
+        if processor is None:
+            processor = self.get_hf_processor()
+
+        return processor.get_num_image_tokens(
+            image_width=image_width,
+            image_height=image_height,
+            use_msac=use_msac,
+        )
 
-        # The section below is only used with dummy data during
-        # memory profiling.
-        hf_config = ctx.get_hf_config()
+    def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        image_pixel_values_mapper = image_to_pixel_values_wrapper(
-            hf_config, max_dynamic_patch)
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            processor=None,
+            use_msac=use_msac,
+        )
 
-        if isinstance(data, Image.Image):
-            pixel_values = image_pixel_values_mapper(data)
-            pixel_values = pixel_values.unsqueeze(0)
 
-        elif is_list_of(data, Image.Image):
-            hf_config.use_msac = False
-            pixel_values = [image_pixel_values_mapper(img) for img in data]
+class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
+                               ):
+
+    def __init__(self,
+                 info: H2OVLProcessingInfo,
+                 dummy_inputs: "BaseDummyInputsBuilder[H2OVLProcessingInfo]",
+                 *,
+                 cache: Optional[ProcessingCache] = None,
+                 enable_sanity_checks: bool = True) -> None:
+        super().__init__(
+            info,
+            dummy_inputs,
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+        if self.cache is not None:
+            # The processor output depends on the number of images passed,
+            # making it incompatible with processing cache which is supposed
+            # to be invariant of how many images are passed per prompt
+            self.cache = None
+            logger.warning_once(
+                f"{type(self).__name__} does not support processing cache.")
 
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        if "image_num_patches" in out_mm_kwargs:
+            image_num_patches = out_mm_kwargs["image_num_patches"]
+            assert isinstance(image_num_patches, torch.Tensor)
+            image_num_patches = image_num_patches.tolist()
+        elif "image_embeds" in out_mm_kwargs:
+            # TODO: Use image size information in dictionary embedding inputs
+            # to compute num_patches (similar to Qwen2-VL)
+            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
         else:
-            return MultiModalKwargs({"image_embeds": data})
-        model_config = ctx.model_config
-        tokenizer = cached_get_tokenizer(
-            model_config.tokenizer,
-            trust_remote_code=model_config.trust_remote_code,
-        )
-        image_token_id = tokenizer.encode(
-            self.img_context_token,
-            add_special_tokens=False,
-            return_tensors="pt",
-        )[0]
+            image_num_patches = []
+
+        num_images = len(image_num_patches)
 
-        return MultiModalKwargs({
-            "pixel_values": pixel_values,
-            "image_token_id": image_token_id
-        })
+        def get_replacement_internvl(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
 
+            if isinstance(images, ImageEmbeddingItems):
+                feature_size = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                feature_size = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    processor=hf_processor,
+                    use_msac=None if num_images == 1 else False,
+                )
+
+            num_patches = image_num_patches[item_idx]
+            if num_patches is not None:
+                assert isinstance(num_patches, int)
+
+            return PromptReplacementDetails(
+                full=hf_processor.get_image_repl_full(feature_size,
+                                                      num_patches),
+                features=hf_processor.get_image_repl_features(
+                    feature_size, num_patches),
+            )
 
-input_pipeline = H2OVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT)
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement=get_replacement_internvl,
+            )
+        ]
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data)
-@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor)
+@MULTIMODAL_REGISTRY.register_processor(
+    H2OVLMultiModalProcessor,
+    info=H2OVLProcessingInfo,
+    dummy_inputs=InternVLDummyInputsBuilder)
 class H2OVLChatModel(InternVLChatModel):
 
     def _init_vision_model(
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index c46a867a7..08fc659ab 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -6,35 +6,37 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-import re
-from functools import cached_property, partial
+from abc import ABC, abstractmethod
+from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+                    TypedDict, TypeVar, Union)
 
 import torch
 import torch.nn as nn
 import torchvision.transforms as T
 from PIL import Image
-from transformers import PretrainedConfig
+from transformers import BatchFeature, PretrainedConfig, TensorType
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.models.intern_vit import (InternVisionModel,
                                                    InternVisionPatchModel)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
-                   get_clip_num_patches)
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -75,22 +77,27 @@ InternVLImageInputs = Union[InternVLImagePixelInputs,
                             InternVLImageEmbeddingInputs]
 
 
-# copied from https://huggingface.co/OpenGVLab/InternVL2-1B
-def build_transform(input_size):
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def build_transform(input_size: int):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose([
+    return T.Compose([
         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
         T.Resize((input_size, input_size),
                  interpolation=T.InterpolationMode.BICUBIC),
         T.ToTensor(),
         T.Normalize(mean=MEAN, std=STD)
     ])
-    return transform
 
 
-# copied from https://huggingface.co/OpenGVLab/InternVL2-1B
-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
-                              image_size):
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def find_closest_aspect_ratio(
+    aspect_ratio: float,
+    target_ratios: list[tuple[int, int]],
+    *,
+    width: int,
+    height: int,
+    image_size: int,
+) -> tuple[int, int]:
     best_ratio_diff = float('inf')
     best_ratio = (1, 1)
     area = width * height
@@ -106,67 +113,82 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
     return best_ratio
 
 
-def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int,
-                         max_num: int, image_size: int,
-                         use_thumbnail: bool) -> Tuple[int, int, int]:
-    aspect_ratio = orig_width / orig_height
+def resolve_internvl_min_max_num(
+    *,
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: bool,
+    use_thumbnail: bool,
+) -> tuple[int, int]:
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
+
+    if use_thumbnail and max_dynamic_patch != 1:
+        max_dynamic_patch += 1
+
+    return min_dynamic_patch, max_dynamic_patch
+
 
-    # calculate the existing image aspect ratio
-    target_ratios = set((i, j) for n in range(min_num, max_num + 1)
-                        for i in range(1, n + 1) for j in range(1, n + 1)
-                        if i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+def get_internvl_target_ratios(
+    min_num: int,
+    max_num: int,
+) -> list[tuple[int, int]]:
+    target_ratios = {(i, j)
+                     for n in range(min_num, max_num + 1)
+                     for i in range(1, n + 1)
+                     for j in range(1, n + 1) if min_num <= i * j <= max_num}
+    return sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+
+def calculate_internvl_targets(
+    *,
+    orig_width: int,
+    orig_height: int,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[int, int, int]:
+    aspect_ratio = orig_width / orig_height
 
     # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
-                                                    target_ratios, orig_width,
-                                                    orig_height, image_size)
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio,
+        target_ratios,
+        width=orig_width,
+        height=orig_height,
+        image_size=image_size,
+    )
 
     # calculate the target width and height
     target_width = image_size * target_aspect_ratio[0]
     target_height = image_size * target_aspect_ratio[1]
     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-    # add thumbnail image if num_blocks > 1
-    if use_thumbnail and blocks > 1:
-        blocks += 1
-    return blocks, target_width, target_height
-
 
-def calculate_num_blocks_wrapper(
-    hf_config: PretrainedConfig,
-    max_dynamic_patch: Optional[int] = None,
-    dynamic_image_size: Optional[bool] = None,
-):
-    if dynamic_image_size is None:
-        dynamic_image_size = hf_config.dynamic_image_size
+    # add thumbnail image if num_blocks != 1
+    if use_thumbnail and blocks != 1:
+        blocks += 1
 
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-    if max_dynamic_patch is None:
-        max_dynamic_patch = hf_config.max_dynamic_patch
-    min_num = hf_config.min_dynamic_patch
-    image_size = hf_config.vision_config.image_size
-    use_thumbnail = hf_config.use_thumbnail
-    return partial(calculate_num_blocks,
-                   min_num=min_num,
-                   max_num=max_dynamic_patch,
-                   image_size=image_size,
-                   use_thumbnail=use_thumbnail)
+    return blocks, target_width, target_height
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int,
-                       image_size: int,
-                       use_thumbnail: bool) -> List[Image.Image]:
+def dynamic_preprocess_internvl(
+    image: Image.Image,
+    *,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> list[Image.Image]:
     orig_width, orig_height = image.size
 
     # calculate the number of blocks without thumbnail
-    blocks, target_width, target_height = calculate_num_blocks(
-        orig_width,
-        orig_height,
-        min_num,
-        max_num,
-        image_size,
-        use_thumbnail=False)
+    blocks, target_width, target_height = calculate_internvl_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=target_ratios,
+        image_size=image_size,
+        use_thumbnail=False,
+    )
+
     # resize the image
     resized_img = image.resize((target_width, target_height))
     processed_images = []
@@ -178,301 +200,463 @@ def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int,
         # split the image
         split_img = resized_img.crop(box)
         processed_images.append(split_img)
+
     assert len(processed_images) == blocks
+
     if use_thumbnail and len(processed_images) != 1:
         thumbnail_img = image.resize((image_size, image_size))
         processed_images.append(thumbnail_img)
+
     return processed_images
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def image_to_pixel_values(image: Image.Image, input_size: int, min_num: int,
-                          max_num: int, use_thumbnail: bool) -> torch.Tensor:
+def image_to_pixel_values_internvl(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+) -> torch.Tensor:
+    target_ratios = get_internvl_target_ratios(min_num, max_num)
+
     transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess(image,
-                                min_num=min_num,
-                                max_num=max_num,
-                                image_size=input_size,
-                                use_thumbnail=use_thumbnail)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values)
+    images = dynamic_preprocess_internvl(
+        image,
+        target_ratios=target_ratios,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
+    )
+
+    pixel_values = torch.stack([transform(image) for image in images])
     return pixel_values
 
 
-def image_to_pixel_values_wrapper(
-    hf_config: PretrainedConfig,
-    max_dynamic_patch: Optional[int] = None,
-    dynamic_image_size: Optional[bool] = None,
-):
-    image_size = hf_config.vision_config.image_size
-    min_num = hf_config.min_dynamic_patch
-    if dynamic_image_size is None:
-        dynamic_image_size = hf_config.dynamic_image_size
+class BaseInternVLProcessor(ABC):
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
 
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-    if max_dynamic_patch is None:
-        max_dynamic_patch = hf_config.max_dynamic_patch
-    use_thumbnail = hf_config.use_thumbnail
-    return partial(image_to_pixel_values,
-                   input_size=image_size,
-                   min_num=min_num,
-                   max_num=max_dynamic_patch,
-                   use_thumbnail=use_thumbnail)
-
-
-def get_internvl_num_patches(hf_config: PretrainedConfig):
-    vision_config = hf_config.vision_config
-    downsample_ratio = hf_config.downsample_ratio
-    image_size = vision_config.image_size
-    patch_size = vision_config.patch_size
-    return int(
-        get_clip_num_patches(image_size=image_size, patch_size=patch_size) *
-        (downsample_ratio**2))
-
-
-def get_max_internvl_image_tokens(
-    ctx: InputContext,
-    *,
-    max_dynamic_patch: Optional[int] = None,
-    dynamic_image_size: Optional[bool] = None,
-):
-    hf_config = ctx.get_hf_config()
-    if dynamic_image_size is None:
-        dynamic_image_size = hf_config.dynamic_image_size
+    The code to insert image tokens is based on:
+    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
+    """
 
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-    if max_dynamic_patch is None:
-        max_dynamic_patch = hf_config.max_dynamic_patch
-    use_thumbnail = hf_config.use_thumbnail
-    if use_thumbnail and max_dynamic_patch > 1:
-        max_dynamic_patch += 1
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> None:
+        super().__init__()
 
-    num_patches = get_internvl_num_patches(hf_config)
-    return num_patches * max_dynamic_patch
+        self.config = config
+        self.tokenizer = tokenizer
 
+        image_size: int = config.vision_config.image_size
+        patch_size: int = config.vision_config.patch_size
 
-def get_max_internvl_image_size(
-    ctx: InputContext,
-    *,
-    max_dynamic_patch: Optional[int] = None,
-    dynamic_image_size: Optional[bool] = None,
-):
-    hf_config = ctx.get_hf_config()
-    image_size = hf_config.vision_config.image_size
-    if dynamic_image_size is None:
-        dynamic_image_size = hf_config.dynamic_image_size
+        if dynamic_image_size is None:
+            dynamic_image_size = config.dynamic_image_size
+        assert isinstance(dynamic_image_size, bool)
 
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-    if max_dynamic_patch is None:
-        max_dynamic_patch = hf_config.max_dynamic_patch
-    use_thumbnail = hf_config.use_thumbnail
-    if use_thumbnail and max_dynamic_patch > 1:
-        max_dynamic_patch += 1
-    width = image_size * max_dynamic_patch
-    height = image_size
-    return width, height
+        if max_dynamic_patch is None:
+            max_dynamic_patch = config.max_dynamic_patch
+        assert isinstance(max_dynamic_patch, int)
 
+        self.num_image_token = int(
+            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+        self.image_size = image_size
+        self.min_dynamic_patch: int = config.min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail: bool = config.use_thumbnail
+
+    @property
+    @abstractmethod
+    def image_token_id(self) -> int:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_image_repl_features(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        raise NotImplementedError
 
-class InternVLInputPipeline:
+    @abstractmethod
+    def get_image_repl_full(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        raise NotImplementedError
 
-    def __init__(
+    def resolve_min_max_num(
         self,
-        img_start_token: str,
-        img_end_token: str,
-        img_context_token: str,
-    ) -> None:
-        super().__init__()
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        use_thumbnail: Optional[bool] = None,
+    ) -> tuple[int, int]:
+        min_dynamic_patch = self.min_dynamic_patch
+        max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
+                             is None else max_dynamic_patch)
+        dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
+                              is None else dynamic_image_size)
+        use_thumbnail = (self.use_thumbnail
+                         if use_thumbnail is None else use_thumbnail)
+
+        return resolve_internvl_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
 
-        self.img_start_token = img_start_token
-        self.img_end_token = img_end_token
-        self.img_context_token = img_context_token
+    def resolve_target_ratios(
+        self,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        use_thumbnail: Optional[bool] = None,
+    ) -> list[tuple[int, int]]:
+        min_num, max_num = self.resolve_min_max_num(
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
 
-    def _create_image_prompt(self, feature_size: int, num_patches: int) -> str:
-        return (self.img_start_token + self.img_context_token * feature_size +
-                self.img_end_token)
+        return get_internvl_target_ratios(min_num, max_num)
 
-    def _expand_image_prompt(
+    def get_num_image_tokens(
         self,
-        prompt: str,
-        feature_sizes: List[int],
-        num_patches: int,
-    ) -> str:
-        image_idx = sorted(
-            map(int, re.findall(r"Image-(\d+): <image>\n", prompt)))
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        target_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,  # Applied in calculate_targets
+        )
+
+        num_patches, _, _ = calculate_internvl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=self.image_size,
+            target_ratios=target_ratios,
+            use_thumbnail=self.use_thumbnail,
+        )
 
-        new_prompt = prompt
-        for idx, feature_size in enumerate(feature_sizes, start=1):
-            image_prompt = self._create_image_prompt(feature_size, num_patches)
-            if not image_idx:
-                image_prompt = f"Image-{idx}: {image_prompt}"
+        return num_patches * self.num_image_token
 
-            new_prompt = new_prompt.replace('<image>', image_prompt, 1)
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> list[torch.Tensor]:
+        min_num, max_num = self.resolve_min_max_num(
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
+        )
 
-        return new_prompt
+        return [
+            image_to_pixel_values_internvl(
+                image,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=self.use_thumbnail,
+            ) for image in images
+        ]
 
-    def input_processor(
+    def __call__(
         self,
-        ctx: InputContext,
-        inputs: DecoderOnlyInputs,
-        *,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
-    ) -> DecoderOnlyInputs:
-        multi_modal_data = inputs.get("multi_modal_data")
-        if multi_modal_data is None or "image" not in multi_modal_data:
-            return inputs
-
-        model_config = ctx.model_config
-        hf_config = ctx.get_hf_config()
-
-        image_data = multi_modal_data["image"]
-        num_patches = get_internvl_num_patches(hf_config)
-        num_blocks_calculator = calculate_num_blocks_wrapper(
-            hf_config, max_dynamic_patch, dynamic_image_size)
-        if isinstance(image_data, Image.Image):
-            width, height = image_data.size
-            num_blocks, _, _ = num_blocks_calculator(width, height)
-            image_feature_sizes = [num_blocks * num_patches]
-        elif is_list_of(image_data, Image.Image):
-            image_feature_sizes = []
-            for image in image_data:
-                width, height = image.size
-                num_blocks, _, _ = num_blocks_calculator(width, height)
-                image_feature_sizes.append(num_blocks * num_patches)
-        elif isinstance(image_data, torch.Tensor):
-            num_images, image_feature_size, hidden_size = image_data.shape
-            image_feature_sizes = [image_feature_size]
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ) -> BatchFeature:
+        if text is None:
+            text = []
+        if not isinstance(text, list):
+            text = [text]
+        if images is None:
+            images = []
+        if not isinstance(images, list):
+            images = [images]
+
+        if len(images) == 0:
+            image_inputs = {}
         else:
-            raise TypeError(f"Invalid image type: {type(image_data)}")
-
-        tokenizer = cached_get_tokenizer(
-            model_config.tokenizer,
-            trust_remote_code=model_config.trust_remote_code)
-
-        prompt = inputs.get("prompt")
-        prompt_token_ids = inputs["prompt_token_ids"]
-        if prompt is None:
-            prompt = tokenizer.decode(prompt_token_ids)
-
-        new_prompt = self._expand_image_prompt(prompt, image_feature_sizes,
-                                               num_patches)
-        new_prompt_token_ids = tokenizer.encode(new_prompt)
-        img_context_token_id = tokenizer.encode(self.img_context_token,
-                                                add_special_tokens=False)
-        assert len(img_context_token_id) == 1, \
-            (f"Invalid image token '{self.img_context_token}': A valid image "
-            f"token encodes to a single token ID, got {img_context_token_id}.")
-        img_context_token_id = img_context_token_id[0]
-
-        # Get precise tracking of placeholder positions
-        token_idx = image_idx = 0
-        placeholder_ranges = []
-        while token_idx < len(new_prompt_token_ids):
-            if new_prompt_token_ids[token_idx] == img_context_token_id:
-                curr_image_featue_size = image_feature_sizes[image_idx]
-                placeholder_ranges.append(
-                    PlaceholderRange(offset=token_idx,
-                                     length=curr_image_featue_size))
-                image_idx += 1
-                token_idx += curr_image_featue_size
-            else:
-                token_idx += 1
+            pixel_values_lst = self._images_to_pixel_values_lst(
+                images,
+                max_dynamic_patch=max_dynamic_patch,
+                dynamic_image_size=dynamic_image_size,
+            )
+            image_inputs = {
+                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "image_num_patches": list(map(len, pixel_values_lst)),
+            }
+
+            for pixel_values in pixel_values_lst:
+                num_patches = pixel_values.shape[0]
+                feature_size = num_patches * self.num_image_token
+
+                image_repl = self.get_image_repl_full(feature_size,
+                                                      num_patches)
+                text = [t.replace('<image>', image_repl, 1) for t in text]
+
+        text_inputs = self.tokenizer(text)
+
+        return BatchFeature(
+            {
+                **text_inputs,
+                **image_inputs,
+            },
+            tensor_type=return_tensors,
+        )
 
-        return token_inputs(
-            prompt=prompt,
-            prompt_token_ids=new_prompt_token_ids,
-            multi_modal_data=multi_modal_data,
-            multi_modal_placeholders={"image": placeholder_ranges})
 
-    def input_mapper(
+class InternVLProcessor(BaseInternVLProcessor):
+
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[IMG_CONTEXT]
+
+    def get_image_repl_features(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        return IMG_CONTEXT * feature_size
+
+    def get_image_repl_full(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        features = self.get_image_repl_features(feature_size, num_patches)
+        return IMG_START + features + IMG_END
+
+
+class BaseInternVLProcessingInfo(BaseProcessingInfo):
+
+    @abstractmethod
+    def get_hf_processor(
         self,
-        ctx: InputContext,
-        data: object,
         *,
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
-    ):
-        hf_config = ctx.get_hf_config()
-
-        image_pixel_values_mapper = image_to_pixel_values_wrapper(
-            hf_config, max_dynamic_patch, dynamic_image_size)
-        if isinstance(data, Image.Image):
-            data = image_pixel_values_mapper(data)
-            # Add an N dimension for number of images per prompt (currently 1).
-            data = data.unsqueeze(0)
-        elif is_list_of(data, Image.Image):
-            # we can't stack here because images may have different num_patches
-            data = [image_pixel_values_mapper(img) for img in data]
-        else:
-            return MultiModalKwargs({"image_embeds": data})
-        model_config = ctx.model_config
-        tokenizer = cached_get_tokenizer(
-            model_config.tokenizer,
-            trust_remote_code=model_config.trust_remote_code)
-        image_token_id = tokenizer.encode(self.img_context_token,
-                                          add_special_tokens=False,
-                                          return_tensors="pt")[0]
-
-        return MultiModalKwargs({
-            "pixel_values": data,
-            "image_token_id": image_token_id
-        })
-
-    def dummy_data(
+    ) -> BaseInternVLProcessor:
+        raise NotImplementedError
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_mm_max_tokens_per_item(
         self,
-        ctx: InputContext,
         seq_len: int,
         mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        return {"image": self.get_max_image_tokens()}
+
+    def get_num_image_tokens(
+        self,
         *,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-    ):
-        num_images = mm_counts["image"]
+        image_width: int,
+        image_height: int,
+        processor: Optional[BaseInternVLProcessor],
+    ) -> int:
+        if processor is None:
+            processor = self.get_hf_processor()
+
+        return processor.get_num_image_tokens(
+            image_width=image_width,
+            image_height=image_height,
+        )
 
-        hf_config = ctx.get_hf_config()
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        image_feature_size = get_max_internvl_image_tokens(
-            ctx,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            processor=None,
         )
-        model_config = ctx.model_config
-        tokenizer = cached_get_tokenizer(
-            model_config.tokenizer,
-            trust_remote_code=model_config.trust_remote_code)
-
-        seq_data, ranges = dummy_seq_data_for_clip(
-            hf_config.vision_config,
-            seq_len,
-            num_images,
-            image_token_id=tokenizer.encode(self.img_context_token,
-                                            add_special_tokens=False)[0],
-            image_feature_size_override=image_feature_size,
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        processor = self.get_hf_processor()
+
+        base_size = processor.image_size
+        target_ratios = processor.resolve_target_ratios()
+
+        largest_feature_size, largest_feature_pinpoint = 0, None
+        for wr, hr in target_ratios:
+            width, height = base_size * wr, base_size * hr
+
+            feat_size = self.get_num_image_tokens(
+                image_width=width,
+                image_height=height,
+                processor=processor,
+            )
+            if feat_size > largest_feature_size:
+                largest_feature_size = feat_size
+                largest_feature_pinpoint = ImageSize(width=width,
+                                                     height=height)
+
+        if largest_feature_size == 0 or largest_feature_pinpoint is None:
+            raise ValueError("Cannot have a largest feature size of 0!")
+
+        return largest_feature_pinpoint
+
+
+_I = TypeVar("_I", bound=BaseInternVLProcessingInfo)
+
+
+class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="<image>" * num_images,
+            mm_data=mm_data,
         )
 
-        max_image_width, max_image_height = get_max_internvl_image_size(
-            ctx,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
+
+class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
         )
 
-        mm_data = dummy_image_for_clip(
-            hf_config.vision_config,
-            num_images,
-            image_width_override=max_image_width,
-            image_height_override=max_image_height,
+        image_token_id = self.info.get_hf_processor(**mm_kwargs).image_token_id
+        image_data = mm_data.get("images", [])
+        assert isinstance(image_data, list)
+
+        # Since there may be extra tokens in the feature placeholders,
+        # we need to pass the image token ID to the model to select the
+        # tokens to merge from the vision encoder outputs
+        processed_outputs["image_token_id"] = [image_token_id
+                                               ] * len(image_data)
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
+
+        return dict(
+            pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_num_patches),
+            image_num_patches=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+            image_token_id=MultiModalFieldConfig.batched("image"),
         )
 
-        return DummyData(seq_data, mm_data, ranges)
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        if "image_num_patches" in out_mm_kwargs:
+            image_num_patches = out_mm_kwargs["image_num_patches"]
+            assert isinstance(image_num_patches, torch.Tensor)
+            image_num_patches = image_num_patches.tolist()
+        elif "image_embeds" in out_mm_kwargs:
+            # TODO: Use image size information in dictionary embedding inputs
+            # to compute num_patches (similar to Qwen2-VL)
+            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+        else:
+            image_num_patches = []
+
+        def get_replacement_internvl(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                feature_size = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                feature_size = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    processor=hf_processor,
+                )
+
+            num_patches = image_num_patches[item_idx]
+            if num_patches is not None:
+                assert isinstance(num_patches, int)
+
+            return PromptReplacementDetails(
+                full=hf_processor.get_image_repl_full(feature_size,
+                                                      num_patches),
+                features=hf_processor.get_image_repl_features(
+                    feature_size, num_patches),
+            )
 
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement=get_replacement_internvl,
+            )
+        ]
 
-input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT)
 
+class InternVLProcessingInfo(BaseInternVLProcessingInfo):
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data)
-@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor)
+    def get_hf_processor(
+        self,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> InternVLProcessor:
+        return InternVLProcessor(
+            self.get_hf_config(),
+            self.get_tokenizer(),
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    InternVLMultiModalProcessor,
+    info=InternVLProcessingInfo,
+    dummy_inputs=InternVLDummyInputsBuilder)
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
@@ -621,11 +805,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[InternVLImageInputs]:
-        pixel_values = kwargs.pop("pixel_values", None)
-        image_token_id = kwargs.pop("image_token_id", None)
+        pixel_values_flat = kwargs.pop("pixel_values_flat", None)
+        image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
-        if pixel_values is None and image_embeds is None:
+        if pixel_values_flat is None and image_embeds is None:
             return None
 
         if image_embeds is not None:
@@ -638,31 +822,30 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
                 data=flatten_bn(image_embeds),
             )
 
-        self.img_context_token_id = image_token_id[0]
+        image_token_id = kwargs["image_token_id"]
+        assert isinstance(image_token_id, torch.Tensor)
+        self.img_context_token_id = image_token_id.flatten().unique().item()
 
-        if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
+        if pixel_values_flat is not None:
+            if not isinstance(pixel_values_flat, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            patches_per_image = []
-            for request_pixel_values in pixel_values:
-                for image_pixel_values in request_pixel_values:
-                    patches_per_image.append(image_pixel_values.shape[0])
-            # We need to flatten (B, N, P) to (B*N*P),
-            # so we call flatten_bn twice.
+                                 f"Got type: {type(pixel_values_flat)}")
+
+            assert isinstance(image_num_patches, (torch.Tensor, list))
+
             return InternVLImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(
-                    flatten_bn(flatten_bn(pixel_values), concat=True)),
-                patches_per_image=patches_per_image)
+                    flatten_bn(pixel_values_flat, concat=True)),
+                patches_per_image=flatten_bn(image_num_patches,
+                                             concat=True).tolist())
 
         raise AssertionError("This line should be unreachable.")
 
     def _process_image_input(
         self,
         image_input: InternVLImageInputs,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -689,7 +872,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         image_embeds = image_embeds.split(image_feature_sizes)
         return image_embeds
 
-    def _set_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
         if self.is_mono:
             self.visual_token_mask = (
                 input_ids == self.img_context_token_id).reshape(-1, 1)
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 19effcbfc..63d308ef6 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -125,7 +125,11 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {"image": self.get_max_image_tokens()}
 
     def _apply_feature_select_strategy(
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index d70ae2f14..817edcef4 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -62,7 +62,11 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"video": 1}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         target_width, target_height = self.get_image_size_with_most_features()
 
         max_video_tokens = self.get_num_video_tokens(
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index f1c06cd85..288942628 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -103,7 +103,11 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),
             "video": self.get_max_video_tokens(seq_len),
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index f1c168076..ab697fb8c 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -23,7 +23,6 @@
 # limitations under the License.
 """Inference-only MiniCPM-O model compatible with HuggingFace weights."""
 from functools import partial
-from itertools import accumulate
 from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
                     Tuple, TypedDict, Union)
 
@@ -138,11 +137,15 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None, "audio": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),
             "audio": self.get_max_audio_tokens(),
-            "video": self.get_max_video_tokens(seq_len)
+            "video": self.get_max_video_tokens(seq_len),
         }
 
     def get_default_audio_pool_step(self) -> int:
@@ -369,23 +372,18 @@ class MiniCPMOMultiModalProcessor(
         hf_inputs,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
+        audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0))
 
-        def get_slices(num_slices: List[int]) -> List[int]:
-            slice_indices = [0] + list(accumulate(num_slices))
-            slices = [(slice_indices[i], slice_indices[i + 1])
-                      for i in range(len(num_slices))]
-            return [slice(*slice_item) for slice_item in slices]
-
-        audio_slices = get_slices(
-            hf_inputs.get("audio_num_slices", torch.empty(0)))
         return dict(
             **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs),
-            audio_features=MultiModalFieldConfig.flat("audio", audio_slices),
-            audio_feature_lens=MultiModalFieldConfig.flat(
-                "audio", audio_slices),
+            audio_features=MultiModalFieldConfig.flat_from_sizes(
+                "audio", audio_num_slices),
+            audio_feature_lens=MultiModalFieldConfig.flat_from_sizes(
+                "audio", audio_num_slices),
             audio_num_slices=MultiModalFieldConfig.batched("audio"),
             audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"),
-            audio_embeds=MultiModalFieldConfig.flat("audio", audio_slices))
+            audio_embeds=MultiModalFieldConfig.flat_from_sizes(
+                "audio", audio_num_slices))
 
 
 class MultiModalProjector(nn.Module):
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 6964d6bdc..3d16d635b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -26,7 +26,6 @@ import math
 import re
 from collections import Counter
 from functools import cached_property, partial
-from itertools import accumulate
 from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
                     Optional, Set, Tuple, TypedDict, Union)
 
@@ -365,7 +364,11 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
         else:
             return {"image": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         mm_max_tokens = {"image": self.get_max_image_tokens()}
         if self.get_model_version() == (2, 6):
             mm_max_tokens["video"] = self.get_max_video_tokens(seq_len)
@@ -761,30 +764,25 @@ class MiniCPMVMultiModalProcessor(
         hf_inputs,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-
-        def get_slices(num_slices: List[int]) -> List[int]:
-            slice_indices = [0] + list(accumulate(num_slices))
-            slices = [(slice_indices[i], slice_indices[i + 1])
-                      for i in range(len(num_slices))]
-            return [slice(*slice_item) for slice_item in slices]
-
-        image_slices = get_slices(
-            hf_inputs.get("image_num_slices", torch.empty(0)))
-        video_slices = get_slices(
-            hf_inputs.get("video_num_slices", torch.empty(0)))
-
-        return dict(
-            pixel_values=MultiModalFieldConfig.flat("image", image_slices),
-            image_sizes=MultiModalFieldConfig.batched("image"),
-            tgt_sizes=MultiModalFieldConfig.flat("image", image_slices),
-            image_num_slices=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.flat("image", image_slices),
-            video_pixel_values=MultiModalFieldConfig.flat(
-                "video", video_slices),
-            video_image_sizes=MultiModalFieldConfig.batched("video"),
-            video_tgt_sizes=MultiModalFieldConfig.flat("video", video_slices),
-            video_embeds=MultiModalFieldConfig.flat("video", video_slices),
-            video_num_slices=MultiModalFieldConfig.batched("video"))
+        image_num_slices = hf_inputs.get("image_num_slices", torch.empty(0))
+        video_num_slices = hf_inputs.get("video_num_slices", torch.empty(0))
+
+        return dict(pixel_values=MultiModalFieldConfig.flat_from_sizes(
+            "image", image_num_slices),
+                    image_sizes=MultiModalFieldConfig.batched("image"),
+                    tgt_sizes=MultiModalFieldConfig.flat_from_sizes(
+                        "image", image_num_slices),
+                    image_num_slices=MultiModalFieldConfig.batched("image"),
+                    image_embeds=MultiModalFieldConfig.flat_from_sizes(
+                        "image", image_num_slices),
+                    video_pixel_values=MultiModalFieldConfig.flat_from_sizes(
+                        "video", video_num_slices),
+                    video_image_sizes=MultiModalFieldConfig.batched("video"),
+                    video_tgt_sizes=MultiModalFieldConfig.flat_from_sizes(
+                        "video", video_num_slices),
+                    video_embeds=MultiModalFieldConfig.flat_from_sizes(
+                        "video", video_num_slices),
+                    video_num_slices=MultiModalFieldConfig.batched("video"))
 
     def apply(
         self,
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 2aa04bd71..9c674ab46 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -6,44 +6,190 @@
 # Copyright (c) 2024 NVIDIA
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
-from typing import Optional
+from typing import Mapping, Optional
 
+import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.inputs import INPUT_REGISTRY
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   MultiModalDataItems)
+from vllm.multimodal.processing import (PromptReplacement,
+                                        PromptReplacementDetails)
+from vllm.multimodal.profiling import ProcessorInputs
 
 from .intern_vit import InternVisionModel
-from .internvl import (InternVLChatModel, InternVLInputPipeline,
-                       get_max_internvl_image_tokens)
+from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor,
+                       InternVLChatModel, InternVLDummyInputsBuilder,
+                       InternVLMultiModalProcessor)
 
-IMG_START = '<|vision_start|>'
-IMG_END = '<|vision_end|>'
-IMG_CONTEXT = '<|vision_pad|>'
+IMG_PAD = "<|vision_pad|>"
 
 
-class NVLMInputPipeline(InternVLInputPipeline):
+class NVLMProcessor(BaseInternVLProcessor):
+
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[IMG_PAD]
+
+    def get_image_repl_features(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        if num_patches is None:
+            raise NotImplementedError("Embedding inputs are not supported")
+
+        tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
+        if self.use_thumbnail and num_patches != 1:
+            tile_pos_identifiers += ["<tile_global_thumbnail>"]
 
-    def _create_image_prompt(self, feature_size: int, num_patches: int) -> str:
-        tile_pos_identifiers = ([f"<tile_{i}>"
-                                 for i in range(1, num_patches)] +
-                                ["<tile_global_thumbnail>"])
         context_size = feature_size // num_patches
+        features = "".join(identifier + IMG_PAD * context_size
+                           for identifier in tile_pos_identifiers)
+
+        # We include the start and end as well because "<Image><tile" is
+        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
+        # when trying to find "<tile" as a subsequence of "<Image><tile"
+        return "<Image>" + features + "</Image>"
+
+    def get_image_repl_full(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        return self.get_image_repl_features(feature_size, num_patches)
+
+
+class NVLMProcessingInfo(BaseInternVLProcessingInfo):
+
+    def get_hf_processor(
+        self,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> NVLMProcessor:
+        return NVLMProcessor(
+            self.get_hf_config(),
+            self.get_tokenizer(),
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+    def get_max_image_tokens(self) -> int:
+        hf_processor = self.get_hf_processor()
+        tokenizer = hf_processor.tokenizer
+
+        max_num_patches = hf_processor.max_dynamic_patch
+        # we need +1 here because max_dynamic_patch in config doesn't
+        # include the thumbnail patch
+        tile_pos_identifiers = [
+            f"<tile_{i+1}>" for i in range(max_num_patches)
+        ]
+        if hf_processor.use_thumbnail and max_num_patches != 1:
+            tile_pos_identifiers += ["<tile_global_thumbnail>"]
+
+        # "<Image><tile" is tokenized as ["<Image", "><", "tile"]
+        # so we include <tile_1> in the start_str
+        start_str = "<Image>" + tile_pos_identifiers.pop(0)
+        end_str = "</Image>"
+        start_token_len = len(tokenizer.encode(start_str))
+        end_token_len = len(tokenizer.encode(end_str))
+        tile_token_len = sum(
+            len(tokenizer.encode(identifier))
+            for identifier in tile_pos_identifiers)
+        non_image_tokens_num = start_token_len + end_token_len + tile_token_len
+        return super().get_max_image_tokens() + non_image_tokens_num
+
+
+class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            # The newline is necessary to separate ">" of the current item
+            # and "<" of the next item
+            prompt_text="<image>\n" * num_images,
+            mm_data=mm_data,
+        )
+
+
+class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
 
-        return '<Image>' + ''.join(
-            tile_pos_identifier + self.img_context_token * context_size
-            for tile_pos_identifier in tile_pos_identifiers) + '</Image>'
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        if "image_num_patches" in out_mm_kwargs:
+            image_num_patches = out_mm_kwargs["image_num_patches"]
+            assert isinstance(image_num_patches, torch.Tensor)
+            image_num_patches = image_num_patches.tolist()
+        elif "image_embeds" in out_mm_kwargs:
+            # TODO: Use image size information in dictionary embedding inputs
+            # to compute num_patches (similar to Qwen2-VL)
+            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+        else:
+            image_num_patches = []
+
+        def get_replacement_nvlm(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
 
+            if isinstance(images, ImageEmbeddingItems):
+                feature_size = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                feature_size = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    processor=hf_processor,
+                )
+
+            num_patches = image_num_patches[item_idx]
+            if num_patches is not None:
+                assert isinstance(num_patches, int)
+
+            return PromptReplacementDetails(
+                full=hf_processor.get_image_repl_full(feature_size,
+                                                      num_patches) + "\n",
+                features=hf_processor.get_image_repl_features(
+                    feature_size, num_patches) + "\n",
+            )
 
-input_pipeline = NVLMInputPipeline(IMG_START, IMG_END, IMG_CONTEXT)
+        # See note in dummy data regarding why we have the extra newline
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>\n",
+                replacement=get_replacement_nvlm,
+            )
+        ]
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data)
-@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor)
+@MULTIMODAL_REGISTRY.register_processor(NVLMMultiModalProcessor,
+                                        info=NVLMProcessingInfo,
+                                        dummy_inputs=NVLMDummyInputsBuilder)
 class NVLM_D_Model(InternVLChatModel):
 
     def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index f089fa5d2..053390c52 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -322,7 +322,11 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         target_width, target_height = self.get_image_size_with_most_features()
 
         max_image_tokens = self.get_num_image_tokens(
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index d7f6662bc..327fad0f5 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -779,7 +779,11 @@ class QWenVLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {"image": self.get_num_image_tokens()}
 
     def get_num_image_tokens(self) -> int:
@@ -799,13 +803,13 @@ class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]):
 
         vision_config = hf_config.visual
 
-        max_image_size = vision_config["image_size"]
+        target_width = target_height = vision_config["image_size"]
         num_images = mm_counts.get("image", 0)
 
         mm_data = {
             "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
                                    num_images=num_images)
         }
 
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index cf104ab00..f09529ca4 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -110,7 +110,11 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         hf_config = self.get_hf_config()
         max_source_positions = hf_config.audio_config.max_source_positions
         max_output_lengths = (max_source_positions - 2) // 2 + 1
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 189ac41e8..2b2638cf6 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -758,7 +758,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),
             "video": self.get_max_video_tokens(seq_len),
@@ -989,26 +993,21 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
-        image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist()
-        image_slices = [
-            slice(image_slice_idxs[i], image_slice_idxs[i + 1])
-            for i in range(len(image_grid_thw))
-        ]
+        image_grid_sizes = image_grid_thw.prod(-1)
 
         video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
-        video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist()
-        video_slices = [
-            slice(video_slice_idxs[i], video_slice_idxs[i + 1])
-            for i in range(len(video_grid_thw))
-        ]
+        video_grid_sizes = video_grid_thw.prod(-1)
 
         return dict(
-            pixel_values=MultiModalFieldConfig.flat("image", image_slices),
-            image_embeds=MultiModalFieldConfig.flat("image", image_slices),
+            pixel_values=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_grid_sizes),
+            image_embeds=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_grid_sizes),
             image_grid_thw=MultiModalFieldConfig.batched("image"),
-            pixel_values_videos=MultiModalFieldConfig.flat(
-                "video", video_slices),
-            video_embeds=MultiModalFieldConfig.flat("video", video_slices),
+            pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
+                "video", video_grid_sizes),
+            video_embeds=MultiModalFieldConfig.flat_from_sizes(
+                "video", video_grid_sizes),
             video_grid_thw=MultiModalFieldConfig.batched("video"),
         )
 
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 5e86b15db..52a4d798f 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -92,7 +92,11 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         feature_extractor = self.get_feature_extractor()
         max_audio_tokens = math.ceil(feature_extractor.chunk_length *
                                      _AUDIO_TOKENS_PER_SECOND)
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index eb52551bb..fe24c7282 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
+from itertools import accumulate
 from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar,
                     Union, cast, final)
 
@@ -258,6 +259,16 @@ class MultiModalFieldConfig:
             slices=slices,
         )
 
+    @staticmethod
+    def flat_from_sizes(modality: str, size_per_item: torch.Tensor):
+        slice_idxs = [0, *accumulate(size_per_item)]
+        slices = [
+            slice(slice_idxs[i], slice_idxs[i + 1])
+            for i in range(len(size_per_item))
+        ]
+
+        return MultiModalFieldConfig.flat(modality, slices)
+
     def __init__(
         self,
         field_cls: type[BaseMultiModalField],
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 2ad42d1c1..d704fa59b 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -680,7 +680,11 @@ class BaseProcessingInfo:
         raise NotImplementedError
 
     @abstractmethod
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         """
         Get the maximum possible number of tokens per data item
         for each modality.
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 953c01000..5dd754854 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -151,7 +151,8 @@ class MultiModalProfiler(Generic[_I]):
         mm_counts = self.get_mm_limits()
 
         info = self.processing_info
-        mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len)
+        mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(
+            seq_len, mm_counts)
 
         if mm_counts.keys() != mm_max_tokens_per_item.keys():
             raise AssertionError(
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 29036691b..041411142 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -264,7 +264,9 @@ class MultiModalRegistry:
             )
             processor = self.create_processor(model_config, tokenizer)
             seq_len = model_config.max_model_len
-            return processor.info.get_mm_max_tokens_per_item(seq_len)
+            mm_limits = self.get_mm_limits_per_prompt(model_config)
+            return processor.info.get_mm_max_tokens_per_item(
+                seq_len, mm_limits)
 
         return {
             key: plugin.get_max_multimodal_tokens(model_config)
-- 
GitLab


From 18a88fcccce73261d51a18aba17368236ceb2f8b Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 4 Feb 2025 02:43:58 -0800
Subject: [PATCH 31/65] [V1] Remove scheduling constraint on partial requests
 (#12674)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 tests/v1/core/test_scheduler.py    | 214 +++++++++++++++++++++++++++++
 vllm/v1/core/scheduler.py          | 129 ++++++++---------
 vllm/v1/worker/block_table.py      |   2 +
 vllm/v1/worker/gpu_model_runner.py | 128 ++++++++++-------
 4 files changed, 350 insertions(+), 123 deletions(-)
 create mode 100644 tests/v1/core/test_scheduler.py

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
new file mode 100644
index 000000000..8eb08f3e8
--- /dev/null
+++ b/tests/v1/core/test_scheduler.py
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Optional
+
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
+from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
+from vllm.sampling_params import SamplingParams
+from vllm.v1.core.scheduler import Scheduler
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.request import Request, RequestStatus
+
+
+def create_scheduler(
+    model: str = "facebook/opt-125m",
+    max_num_seqs: int = 16,
+    max_num_batched_tokens: int = 8192,
+) -> Scheduler:
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_model_len=max_num_batched_tokens,
+    )
+    model_config = ModelConfig(
+        model=model,
+        task="auto",
+        tokenizer=model,
+        tokenizer_mode="auto",
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+    )
+    cache_config = CacheConfig(
+        block_size=16,
+        gpu_memory_utilization=0.9,
+        swap_space=0,
+        cache_dtype="auto",
+    )
+    cache_config.num_gpu_blocks = 10000
+    return Scheduler(scheduler_config,
+                     model_config,
+                     cache_config,
+                     lora_config=None)
+
+
+def create_requests(
+    num_requests: int,
+    num_tokens: int = 10,
+    mm_positions: Optional[List[PlaceholderRange]] = None,
+):
+    sampling_params = SamplingParams()
+    requests = []
+    for i in range(num_requests):
+        if mm_positions is not None:
+            mm_position = mm_positions[i]
+            mm_inputs = [MultiModalKwargs({})] * len(mm_position)
+        else:
+            mm_position = None
+            mm_inputs = None
+        request = Request(
+            request_id=f"{i}",
+            prompt=None,
+            prompt_token_ids=[i] * num_tokens,
+            sampling_params=sampling_params,
+            multi_modal_inputs=mm_inputs,
+            multi_modal_placeholders=mm_position,
+            multi_modal_hashes=None,
+            eos_token_id=None,
+            arrival_time=0,
+        )
+        requests.append(request)
+    return requests
+
+
+def test_add_requests():
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+
+    for i, request in enumerate(requests):
+        scheduler.add_request(request)
+        assert request.request_id in scheduler.requests
+        assert len(scheduler.waiting) == i + 1
+
+
+def test_finish_request():
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+    for request in requests:
+        scheduler.add_request(request)
+
+    for i, request in enumerate(requests):
+        scheduler.finish_requests(request.request_id,
+                                  RequestStatus.FINISHED_ABORTED)
+        assert request.request_id not in scheduler.requests
+        assert len(scheduler.waiting) == 9 - i
+
+
+def test_get_num_unfinished_requests():
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+    for request in requests:
+        scheduler.add_request(request)
+
+    for i, request in enumerate(requests):
+        scheduler.finish_requests(request.request_id,
+                                  RequestStatus.FINISHED_STOPPED)
+        assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1
+
+
+def test_schedule():
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+    for request in requests:
+        scheduler.add_request(request)
+
+    # Test initial scheduling
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == len(requests)
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+    # Verify all requests are scheduled.
+    for req_id, num_tokens in output.num_scheduled_tokens.items():
+        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
+
+    # Verify requests moved from waiting to running
+    assert len(scheduler.waiting) == 0
+    assert len(scheduler.running) == len(requests)
+    for i, request in enumerate(requests):
+        assert scheduler.running[i] == request
+
+
+def test_schedule_multimodal_requests():
+    scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
+    mm_positions = [[PlaceholderRange(offset=i, length=100)]
+                    for i in range(10)]
+    requests = create_requests(
+        num_requests=10,
+        num_tokens=200,
+        mm_positions=mm_positions,
+    )
+    for request in requests:
+        scheduler.add_request(request)
+
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == len(requests)
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+    for req_id, num_tokens in output.num_scheduled_tokens.items():
+        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
+    assert len(output.scheduled_encoder_inputs) == 10
+    for req_id, encoder_input in output.scheduled_encoder_inputs.items():
+        assert len(encoder_input) == 1
+
+
+def test_schedule_partial_requests():
+    """Test scheduling behavior with partial requests.
+
+    This test verifies that:
+    1. The scheduler can handle multiple partial requests in a single step when
+       constrained by encoder budget.
+    2. A request in RUNNING state may be unscheduled in subsequent steps if
+       there is insufficient encoder budget.
+    """
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_num_batched_tokens=1024,
+    )
+    mm_positions = [[PlaceholderRange(offset=100, length=600)]
+                    for _ in range(3)]
+    requests = create_requests(
+        num_requests=3,
+        num_tokens=800,
+        mm_positions=mm_positions,
+    )
+    for request in requests:
+        scheduler.add_request(request)
+
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == 3
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+
+    assert scheduler.max_num_encoder_input_tokens == 1024
+    # The first request is scheduled fully.
+    assert output.num_scheduled_tokens[requests[0].request_id] == 800
+    # The second request is scheduled partially.
+    # The <img> tokens are not scheduled because of the encoder budget.
+    assert output.num_scheduled_tokens[requests[1].request_id] == 100
+    # The third request is also scheduled partially.
+    # The <img> tokens are not scheduled because of the encoder budget.
+    assert output.num_scheduled_tokens[requests[2].request_id] == 100
+    req_to_index = {
+        request.request_id: i
+        for i, request in enumerate(requests)
+    }
+    model_runner_output = ModelRunnerOutput(
+        req_ids=[request.request_id for request in requests],
+        req_id_to_index=req_to_index,
+        sampled_token_ids=[0] * len(requests),
+        logprob_token_ids_cpu=None,
+        logprobs_cpu=None,
+    )
+    scheduler.update_from_output(output, model_runner_output)
+
+    # Schedule the next step.
+    # Only the first and second requests are scheduled.
+    # The third request is in the RUNNING state but not scheduled in this step
+    # because of the encoder budget.
+    output = scheduler.schedule()
+    assert len(scheduler.running) == 3
+    assert len(output.scheduled_new_reqs) == 0
+    assert len(output.scheduled_cached_reqs) == 2
+    assert len(output.finished_req_ids) == 0
+    assert output.num_scheduled_tokens[requests[0].request_id] == 1
+    assert output.num_scheduled_tokens[requests[1].request_id] == 700
+    assert requests[2].request_id not in output.num_scheduled_tokens
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index f4738bb33..fb5e83fe0 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -67,10 +67,10 @@ class Scheduler:
         # This is flushed at the end of each scheduling step.
         self.finished_req_ids: Set[str] = set()
 
-        # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating
+        # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
         # them at each scheduling step.
-        # Request id -> RunningRequestData
-        self.running_reqs_data: Dict[str, RunningRequestData] = {}
+        # Request id -> CachedRequestData
+        self._cached_reqs_data: Dict[str, CachedRequestData] = {}
 
         # Encoder-related.
         # Calculate encoder cache size if applicable
@@ -115,17 +115,8 @@ class Scheduler:
         encoder_budget = self.max_num_encoder_input_tokens
 
         # First, schedule the RUNNING requests.
-        # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be
-        # in the "partial" state, where the request has some tokens computed
-        # but not all. The constraint is due to the persistent batch in the
-        # V1 model runner.
-        # TODO(woosuk): Remove this constraint after refactoring model runner.
-        has_partial_request = False
         req_index = 0
-        while req_index < len(self.running):
-            # Only the last request in the RUNNING queue can be "partial".
-            assert not has_partial_request
-            assert token_budget > 0
+        while req_index < len(self.running) and token_budget > 0:
             request = self.running[req_index]
             num_new_tokens = request.num_tokens - request.num_computed_tokens
             num_new_tokens = min(num_new_tokens, token_budget)
@@ -137,7 +128,14 @@ class Scheduler:
                                                   request.num_computed_tokens,
                                                   num_new_tokens,
                                                   encoder_budget))
-            assert num_new_tokens > 0
+            if num_new_tokens == 0:
+                # The request cannot be scheduled because the encoder budget
+                # or the encoder cache is exhausted.
+                # NOTE(woosuk): Here, by doing `continue` instead of `break`,
+                # we do not strictly follow the FCFS scheduling policy and
+                # allow the lower-priority requests to be scheduled.
+                req_index += 1
+                continue
 
             while True:
                 new_blocks = self.kv_cache_manager.allocate_slots(
@@ -172,8 +170,6 @@ class Scheduler:
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
             req_index += 1
-            has_partial_request = (request.num_computed_tokens + num_new_tokens
-                                   < request.num_tokens)
 
             # Encoder-related.
             if encoder_inputs_to_schedule:
@@ -186,13 +182,9 @@ class Scheduler:
 
         # Next, schedule the WAITING requests.
         if not preempted_reqs:
-            while self.waiting:
-                if has_partial_request:
-                    break
+            while self.waiting and token_budget > 0:
                 if len(self.running) == self.max_num_running_reqs:
                     break
-                if token_budget == 0:
-                    break
 
                 request = self.waiting[0]
                 # Get already-cached tokens.
@@ -249,8 +241,6 @@ class Scheduler:
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
                 request.num_computed_tokens = num_computed_tokens
-                has_partial_request = (num_computed_tokens + num_new_tokens
-                                       < request.num_tokens)
 
                 # Encoder-related.
                 if encoder_inputs_to_schedule:
@@ -266,8 +256,11 @@ class Scheduler:
         assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
         assert token_budget >= 0
         assert len(self.running) <= self.max_num_running_reqs
+        # Since some requests in the RUNNING queue may not be scheduled in
+        # this step, the total number of scheduled requests can be smaller than
+        # len(self.running).
         assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
-                len(scheduled_running_reqs) == len(self.running))
+                len(scheduled_running_reqs) <= len(self.running))
 
         # Get the longest common prefix among all requests in the running queue.
         # This can be potentially used for cascade attention.
@@ -286,25 +279,28 @@ class Scheduler:
             for req in scheduled_new_reqs
         ]
         resumed_reqs_data = [
-            ResumedRequestData.from_request(
-                req, req_to_new_block_ids[req.request_id],
-                req.num_computed_tokens) for req in scheduled_resumed_reqs
+            self._make_cached_request_data(
+                req,
+                req_to_new_block_ids[req.request_id],
+                req.num_computed_tokens,
+                resumed_from_preemption=True,
+            ) for req in scheduled_resumed_reqs
         ]
         running_reqs_data = [
-            self._make_running_request_data(
-                req, req_to_new_block_ids[req.request_id],
-                req.num_computed_tokens) for req in scheduled_running_reqs
+            self._make_cached_request_data(
+                req,
+                req_to_new_block_ids[req.request_id],
+                req.num_computed_tokens,
+                resumed_from_preemption=False,
+            ) for req in scheduled_running_reqs
         ]
-        preempted_req_ids = {req.request_id for req in preempted_reqs}
         scheduler_output = SchedulerOutput(
             scheduled_new_reqs=new_reqs_data,
-            scheduled_resumed_reqs=resumed_reqs_data,
-            scheduled_running_reqs=running_reqs_data,
+            scheduled_cached_reqs=resumed_reqs_data + running_reqs_data,
             num_scheduled_tokens=num_scheduled_tokens,
             total_num_scheduled_tokens=total_num_scheduled_tokens,
             scheduled_encoder_inputs=scheduled_encoder_inputs,
             num_common_prefix_blocks=num_common_prefix_blocks,
-            preempted_req_ids=preempted_req_ids,
             # finished_req_ids is an existing state in the scheduler,
             # instead of being newly scheduled in this step.
             # It contains the request IDs that are finished in between
@@ -316,22 +312,26 @@ class Scheduler:
         self.finished_req_ids = set()
         return scheduler_output
 
-    def _make_running_request_data(
+    def _make_cached_request_data(
         self,
         request: Request,
         new_block_ids: List[int],
         num_computed_tokens: int,
-    ) -> "RunningRequestData":
-        # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating
+        resumed_from_preemption: bool,
+    ) -> "CachedRequestData":
+        # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
         # them at each scheduling step.
-        if request.request_id in self.running_reqs_data:
-            req_data = self.running_reqs_data[request.request_id]
+        if request.request_id in self._cached_reqs_data:
+            req_data = self._cached_reqs_data[request.request_id]
+            req_data.resumed_from_preemption = resumed_from_preemption
             req_data.new_block_ids = new_block_ids
             req_data.num_computed_tokens = num_computed_tokens
         else:
-            req_data = RunningRequestData.from_request(request, new_block_ids,
-                                                       num_computed_tokens)
-            self.running_reqs_data[request.request_id] = req_data
+            req_data = CachedRequestData.from_request(request,
+                                                      resumed_from_preemption,
+                                                      new_block_ids,
+                                                      num_computed_tokens)
+            self._cached_reqs_data[request.request_id] = req_data
         return req_data
 
     def _try_schedule_encoder_inputs(
@@ -420,7 +420,13 @@ class Scheduler:
         # expensive operations inside the loop.
         for request in self.running:
             req_id = request.request_id
-            request.num_computed_tokens += num_scheduled_tokens[req_id]
+            num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0)
+            if num_tokens_scheduled == 0:
+                # The request was not scheduled in this step.
+                new_running.append(request)
+                continue
+
+            request.num_computed_tokens += num_tokens_scheduled
             # When the request's num_computed_tokens catches up its num_tokens,
             # the request generates output tokens. Otherwise, we ignore the
             # sampler output for the request.
@@ -529,7 +535,7 @@ class Scheduler:
         assert request.is_finished()
         self.kv_cache_manager.free(request)
         self.encoder_cache_manager.free(request)
-        self.running_reqs_data.pop(request.request_id, None)
+        self._cached_reqs_data.pop(request.request_id, None)
         del self.requests[request.request_id]
         self.finished_req_ids.add(request.request_id)
 
@@ -584,30 +590,13 @@ class NewRequestData:
 
 
 @dataclass
-class ResumedRequestData:
-
-    req_id: str
-    block_ids: List[int]
-    num_computed_tokens: int
-
-    @classmethod
-    def from_request(
-        cls,
-        request: Request,
-        block_ids: List[int],
-        num_computed_tokens: int,
-    ) -> "ResumedRequestData":
-        return cls(
-            req_id=request.request_id,
-            block_ids=block_ids,
-            num_computed_tokens=num_computed_tokens,
-        )
-
-
-@dataclass
-class RunningRequestData:
+class CachedRequestData:
 
     req_id: str
+    # If resumed_from_preemption is False, new_block_ids will be appended to
+    # the request's block IDs. If True, new_block_ids will be used as the
+    # request's block IDs instead of appending to the existing block IDs.
+    resumed_from_preemption: bool
     new_block_ids: List[int]
     num_computed_tokens: int
 
@@ -615,11 +604,13 @@ class RunningRequestData:
     def from_request(
         cls,
         request: Request,
+        resumed_from_preemption: bool,
         new_block_ids: List[int],
         num_computed_tokens: int,
-    ) -> "RunningRequestData":
+    ) -> "CachedRequestData":
         return cls(
             req_id=request.request_id,
+            resumed_from_preemption=resumed_from_preemption,
             new_block_ids=new_block_ids,
             num_computed_tokens=num_computed_tokens,
         )
@@ -629,14 +620,12 @@ class RunningRequestData:
 class SchedulerOutput:
 
     scheduled_new_reqs: List[NewRequestData]
-    scheduled_resumed_reqs: List[ResumedRequestData]
-    scheduled_running_reqs: List[RunningRequestData]
+    scheduled_cached_reqs: List[CachedRequestData]
 
     num_scheduled_tokens: Dict[str, int]
     total_num_scheduled_tokens: int
     scheduled_encoder_inputs: Dict[str, List[int]]
     num_common_prefix_blocks: int
 
-    preempted_req_ids: Set[str]
     finished_req_ids: Set[str]
     free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 8d0785243..f520ee958 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -46,6 +46,8 @@ class BlockTable:
         start: int,
         block_ids: List[int],
     ) -> None:
+        if not block_ids:
+            return
         num_blocks = len(block_ids)
         self.block_table_np[row_idx, start:start + num_blocks] = block_ids
         self.num_blocks_per_row[row_idx] = start + num_blocks
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0b5644525..7841fac1d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -205,12 +205,32 @@ class GPUModelRunner:
                                         pin_memory=self.pin_memory)
         self.seq_lens_np = self.seq_lens_cpu.numpy()
 
-    def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
-        # Remove stopped requests from the cached states.
-        # Keep the states of the preempted requests.
+    def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
+        """Update the cached states and the persistent batch with the scheduler
+        output.
+
+        The updated states are used by the `_prepare_inputs` function to create
+        the input GPU tensors for the model.
+
+        Returns:
+            True if there is a new/resumed/paused/finished request in the batch.
+            If False, we can skip copying SamplingMetadata to the GPU.
+        """
+        # Remove finished requests from the cached states.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
             self.encoder_cache.pop(req_id, None)
+        # Remove the finished requests from the persistent batch.
+        # NOTE(woosuk): There could be an edge case where finished_req_ids and
+        # scheduled_req_ids overlap. This happens when a request is aborted and
+        # then resubmitted with the same ID. In this case, we treat them as two
+        # distinct requests - clearing the cached states for the first request
+        # and handling the second as a new request.
+        removed_req_indices: List[int] = []
+        for req_id in scheduler_output.finished_req_ids:
+            req_index = self.input_batch.remove_request(req_id)
+            if req_index is not None:
+                removed_req_indices.append(req_index)
 
         # Free the cached encoder outputs.
         for req_id, input_id in scheduler_output.free_encoder_input_ids:
@@ -220,36 +240,22 @@ class GPUModelRunner:
                 if not encoder_outputs:
                     self.encoder_cache.pop(req_id, None)
 
-        # Remove the requests from the persistent batch.
-        stopped_req_ids = set().union(
-            scheduler_output.preempted_req_ids,
-            scheduler_output.finished_req_ids,
-        )
-        removed_req_indices: List[int] = []
-        for req_id in stopped_req_ids:
+        # Remove the unscheduled requests from the persistent batch.
+        # NOTE(woosuk): The unscheduled requests are either preempted requests
+        # or running requests that are not scheduled in this step. We remove
+        # them from the persistent batch but keep their cached states since
+        # they will be scheduled again sometime in the future.
+        scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys()
+        cached_req_ids = self.input_batch.req_id_to_index.keys()
+        unscheduled_req_ids = cached_req_ids - scheduled_req_ids
+        # NOTE(woosuk): The persistent batch optimization assumes that
+        # consecutive batches contain mostly the same requests. If batches
+        # have low request overlap (e.g., alternating between two distinct
+        # sets of requests), this optimization becomes very inefficient.
+        for req_id in unscheduled_req_ids:
             req_index = self.input_batch.remove_request(req_id)
-            if req_index is not None:
-                removed_req_indices.append(req_index)
-
-        # Update the states of the running requests.
-        for req_data in scheduler_output.scheduled_running_reqs:
-            req_id = req_data.req_id
-            req_state = self.requests[req_id]
-            req_index = self.input_batch.req_id_to_index[req_id]
-
-            # Update the num_computed_tokens.
-            req_state.num_computed_tokens = req_data.num_computed_tokens
-            self.input_batch.num_computed_tokens_cpu[req_index] = (
-                req_data.num_computed_tokens)
-
-            # Update the block table.
-            num_new_blocks = len(req_data.new_block_ids)
-            if num_new_blocks == 0:
-                continue
-            start_index = len(req_state.block_ids)
-            req_state.block_ids.extend(req_data.new_block_ids)
-            self.input_batch.block_table.append_row(req_index, start_index,
-                                                    req_data.new_block_ids)
+            assert req_index is not None
+            removed_req_indices.append(req_index)
 
         req_ids_to_add: List[str] = []
         # Add new requests to the cached states.
@@ -305,14 +311,36 @@ class GPUModelRunner:
 
             req_ids_to_add.append(req_id)
 
-        # Update the cached states of the resumed requests.
-        for res_req_data in scheduler_output.scheduled_resumed_reqs:
-            req_id = res_req_data.req_id
+        # Update the states of the running/resumed requests.
+        for req_data in scheduler_output.scheduled_cached_reqs:
+            req_id = req_data.req_id
             req_state = self.requests[req_id]
 
-            req_state.block_ids = res_req_data.block_ids
-            req_state.num_computed_tokens = res_req_data.num_computed_tokens
-            req_ids_to_add.append(req_id)
+            # Update the cached states.
+            req_state.num_computed_tokens = req_data.num_computed_tokens
+            if not req_data.resumed_from_preemption:
+                # Append the new blocks to the existing block IDs.
+                req_state.block_ids.extend(req_data.new_block_ids)
+            else:
+                # The request is resumed from preemption.
+                # Replace the existing block IDs with the new ones.
+                req_state.block_ids = req_data.new_block_ids
+
+            req_index = self.input_batch.req_id_to_index.get(req_id)
+            if req_index is None:
+                # The request is not in the persistent batch.
+                # The request was either preempted and resumed later, or was not
+                # scheduled in the previous step and needs to be added again.
+                req_ids_to_add.append(req_id)
+                continue
+
+            # Update the persistent batch.
+            self.input_batch.num_computed_tokens_cpu[req_index] = (
+                req_data.num_computed_tokens)
+            start_index = len(req_state.block_ids) - len(
+                req_data.new_block_ids)
+            self.input_batch.block_table.append_row(req_index, start_index,
+                                                    req_data.new_block_ids)
 
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
@@ -330,6 +358,7 @@ class GPUModelRunner:
         # Condense the batched states if there are empty indices.
         if removed_req_indices:
             self.input_batch.condense(removed_req_indices)
+        return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
 
     def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -536,10 +565,10 @@ class GPUModelRunner:
             prefix_kv_lens=prefix_kv_lens,
             suffix_kv_lens=suffix_kv_lens,
         )
-        # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial
-        # request in the batch. While we should not sample any token from this
-        # partial request, we do so for simplicity. We will ignore the sampled
-        # token from the partial request.
+        # NOTE(woosuk): Due to chunked prefills, the batch may contain partial
+        # requests. While we should not sample any token from these partial
+        # requests, we do so for simplicity. We will ignore the sampled
+        # tokens from the partial requests.
         # TODO: Support prompt logprobs.
         logits_indices = query_start_loc[1:] - 1
         return attn_metadata, logits_indices
@@ -601,22 +630,15 @@ class GPUModelRunner:
 
     def _prepare_sampling(
         self,
-        scheduler_output: "SchedulerOutput",
+        batch_changed: bool,
     ) -> SamplingMetadata:
-        skip_copy = True
-        if (scheduler_output.finished_req_ids
-                or scheduler_output.preempted_req_ids):
-            skip_copy = False
-        if (scheduler_output.scheduled_new_reqs
-                or scheduler_output.scheduled_resumed_reqs):
-            skip_copy = False
         # Create the sampling metadata.
         req_id_output_token_ids: Dict[str, List[int]] = \
             {req_id: req.output_token_ids \
                 for req_id, req in self.requests.items()}
 
         sampling_metadata = self.input_batch.make_sampling_metadata(
-            req_id_output_token_ids, skip_copy)
+            req_id_output_token_ids, skip_copy=not batch_changed)
         return sampling_metadata
 
     def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
@@ -715,7 +737,7 @@ class GPUModelRunner:
         self,
         scheduler_output: "SchedulerOutput",
     ) -> ModelRunnerOutput:
-        self._update_states(scheduler_output)
+        batch_changed = self._update_states(scheduler_output)
 
         if self.is_multimodal_model:
             # Run the multimodal encoder if any.
@@ -778,7 +800,7 @@ class GPUModelRunner:
         logits = self.model.compute_logits(hidden_states, None)
 
         # Sample the next token and get logprobs if needed.
-        sampling_metadata = self._prepare_sampling(scheduler_output)
+        sampling_metadata = self._prepare_sampling(batch_changed)
         sampler_output = self.model.sample(
             logits=logits,
             sampling_metadata=sampling_metadata,
-- 
GitLab


From 815079de8e9dd984d474f7046412d5aedf4350ff Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Tue, 4 Feb 2025 20:00:51 +0800
Subject: [PATCH 32/65] [VLM] merged multimodal processor and V1 support for
 idefics3 (#12660)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/source/models/supported_models.md        |   2 +-
 .../vision_language/test_models.py            |   4 +-
 .../vision_language/vlm_utils/model_utils.py  |   8 +
 .../multimodal/processing/test_common.py      |   1 +
 .../multimodal/processing/test_idefics3.py    | 179 ++----
 vllm/inputs/registry.py                       |  18 +
 vllm/model_executor/models/idefics3.py        | 560 ++++++++----------
 7 files changed, 315 insertions(+), 457 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index fbdca189a..d8e284292 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -733,7 +733,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `HuggingFaceM4/Idefics3-8B-Llama3` etc.
   * ✅︎
   *
-  *
+  * ✅︎
 - * `InternVLChatModel`
   * InternVL 2.5, Mono-InternVL, InternVL 2.0
   * T + I<sup>E+</sup>
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 7a14ba2f3..5fe46bd75 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -254,14 +254,14 @@ VLM_TEST_SETTINGS = {
         patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
     ),
     "idefics3": VLMTestInfo(
-        models=["HuggingFaceM4/Idefics3-8B-Llama3"],
+        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
-        marks=[large_gpu_mark(min_gb=48)],
+        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
     ),
     "intern_vl": VLMTestInfo(
         models=[
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index d2401b222..ced891e1e 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -192,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
     return output_ids, output_str, out_logprobs
 
 
+def idefics3_trunc_hf_output(hf_output: RunnerOutput,
+                             model: str) -> RunnerOutput:
+    output_ids, output_str, out_logprobs = hf_output
+    if output_str.endswith("<end_of_utterance>"):
+        output_str = output_str.split("<end_of_utterance>")[0]
+    return output_ids, output_str, out_logprobs
+
+
 def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                              model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 07906a71d..5cd749cbd 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -149,6 +149,7 @@ def _test_processing_correctness(
     "adept/fuyu-8b",
     "h2oai/h2ovl-mississippi-800m",
     "OpenGVLab/InternVL2-1B",
+    "HuggingFaceM4/Idefics3-8B-Llama3",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",
     "llava-hf/LLaVA-NeXT-Video-7B-hf",
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index 00c1dae51..07ab1bbd4 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -1,13 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 """Tests for Idefics3's multimodal preprocessing kwargs."""
-from typing import Optional
-
 import pytest
-import torch
-from transformers import AutoImageProcessor, AutoTokenizer
+from transformers import Idefics3Config
 
-from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal import MultiModalRegistry
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
@@ -15,163 +12,53 @@ from ...utils import build_model_context
 models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
 
 
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def input_processor_for_idefics3():
-    from vllm.model_executor.models.idefics3 import (
-        input_processor_for_idefics3)
-    return input_processor_for_idefics3
-
-
-@pytest.fixture()
-def dummy_data_for_idefics3():
-    from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3
-    return dummy_data_for_idefics3
-
-
-@pytest.fixture()
-def get_max_idefics3_image_tokens():
-    from vllm.model_executor.models.idefics3 import (
-        get_max_idefics3_image_tokens)
-    return get_max_idefics3_image_tokens
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336])
-def test_input_mapper_override(model: str, image_assets: _ImageAssets,
-                               longest_edge: Optional[int]):
-    """Ensure that the [default] input mapper handles size properly."""
-
-    mm_processor_kwargs = {
-        "size": {
-            "longest_edge": longest_edge
-        }
-    } if longest_edge is not None else {}
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=mm_processor_kwargs,
-    )
-
-    hf_processor = AutoImageProcessor.from_pretrained(model,
-                                                      trust_remote_code=True,
-                                                      **mm_processor_kwargs)
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    image = image_assets[0].pil_image
-    hf_result = hf_processor.preprocess(
-        image,
-        return_tensors="pt",
-    )
-
-    vllm_result = mm_registry.map_input(
-        ctx.model_config,
-        {"image": image},
-    )
-
-    assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge, expected_max_tokens", [
-    (None, 2873),
-    (168, 169),
-    (336, 169),
-    (400, 338),
-    (672, 338),
-])
-def test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
-                             longest_edge: Optional[int],
-                             expected_max_tokens: int):
-    """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-
-    actual_max_tokens = get_max_idefics3_image_tokens(
-        ctx=InputContext(ctx.model_config),
-        size=size,
-    )
-
-    assert expected_max_tokens == actual_max_tokens
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
-    (168, 169, 1),
-    (168, 169, 2),
-    (400, 338, 1),
-    (400, 338, 2),
-])
-def test_dummy_data_override(dummy_data_for_idefics3, model: str,
-                             longest_edge: int, toks_per_img: int,
-                             num_imgs: int):
-    """Ensure dummy_data_for_idefics3 handles num_crops properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the dummy data func.
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-
-    dummy_data = dummy_data_for_idefics3(
-        ctx=ctx,
-        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
-        mm_counts={"image": num_imgs},
-        size=size)
-    sequence_data = dummy_data.seq_data
-    # Ensure we have the right number of placeholders per size
-    image_token_id = ctx.get_hf_config().image_token_id
-    img_tok_count = sequence_data.get_token_ids().count(image_token_id)
-    assert img_tok_count == toks_per_img * num_imgs
-
-
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
-    (336, 169 * (1**2 + 1), 1),
-    (336, 169 * (1**2 + 1), 2),
-    (400, 169 * (2**2 + 1), 1),
-    (400, 169 * (2**2 + 1), 2),
-])
-def test_input_processor_override(input_processor_for_idefics3,
-                                  image_assets: _ImageAssets, model: str,
-                                  longest_edge: int,
-                                  expected_toks_per_img: int, num_imgs: int):
+# yapf: disable
+@pytest.mark.parametrize(
+    ("mm_processor_kwargs", "expected_toks_per_img"),
+    [
+        ({"size": {"longest_edge": 364}}, 169),
+        ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
+    ])
+# yapf: enable
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(image_assets: _ImageAssets, model: str,
+                            mm_processor_kwargs: dict[str, object],
+                            expected_toks_per_img: int, num_imgs: int):
     """Ensure input_processor_for_idefics3 handles num_crops properly."""
     # Same as the previous test - don't initialize mm_processor_kwargs
     # in this test and assume that the kwargs will be correctly expanded by
     # the partial when calling the custom input processor.
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
     ctx = build_model_context(
         model_name=model,
         tokenizer_name=model,
         trust_remote_code=True,
         mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
     )
+    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
 
     # Build the image str / prompt based on the number of images we pass
-    tokenizer = AutoTokenizer.from_pretrained(model)
     placeholders = "<image>" if num_imgs == 1 else "\n".join(
         f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
     prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501
-    images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs
-
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": images})
 
-    processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size)
+    # Build mm_data
+    image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size
+    dummy_image_size = (image_size * 4, image_size * 4)
+    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
+    mm_data = {"image": [dummy_image] * num_imgs}
+
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    # Ensure the placeholders format are correct
+    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
+        "input_ids"][0]
 
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = ctx.get_hf_config().image_token_id
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 0ec726b8b..cd4214439 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -31,6 +31,17 @@ C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig)
 P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin)
 
 
+class HashableDict(dict):
+    """
+    A dictionary that can be hashed by lru_cache.
+    """
+
+    # NOTE: pythonic dict is not hashable,
+    # we override on it directly for simplicity
+    def __hash__(self) -> int:  # type: ignore[override]
+        return hash(frozenset(self.items()))
+
+
 @dataclass(frozen=True)
 class InputContext:
     """
@@ -104,6 +115,13 @@ class InputContext:
         if isinstance(typ, type):
             merged_kwargs["processor_cls"] = typ
 
+        # NOTE: Pythonic dict is not hashable and will raise unhashable type
+        # error when calling `cached_get_processor`, therefore we need to
+        # wrap it to a hashable dict.
+        for key, value in merged_kwargs.items():
+            if isinstance(value, dict):
+                merged_kwargs[key] = HashableDict(value)
+
         hf_processor = cached_get_processor(
             self.model_config.model,
             trust_remote_code=self.model_config.trust_remote_code,
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 9e2e677a6..fdfabbaaf 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -16,35 +16,35 @@
 """Inference-only Idefics3 model compatible with HuggingFace weights."""
 
 import math
-from typing import (Dict, Iterable, List, Literal, Mapping, NamedTuple,
-                    Optional, Set, Tuple, TypedDict, Union)
+from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Set,
+                    Tuple, TypedDict, Union)
 
 import torch
 import torch.utils.checkpoint
-from PIL import Image
 from torch import nn
-# Temporary solution for transformers below 4.46.0.
-from transformers import PretrainedConfig as Idefics3Config
-from transformers import ProcessorMixin as Idefics3ImageProcessor
+from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor,
+                          Idefics3Processor)
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.inputs import NestedTensors
-from vllm.sequence import IntermediateTensors, SequenceData
-from vllm.transformers_utils.processor import cached_get_processor
-from vllm.utils import is_list_of
+from vllm.multimodal.parse import ImageProcessorItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo,
+                                        MultiModalDataItems,
+                                        MultiModalFieldConfig,
+                                        PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.sequence import IntermediateTensors
 
 # yapf: disable
 from .idefics2_vision_model import (
@@ -77,307 +77,253 @@ class Idefics3ImageEmbeddingInputs(TypedDict):
     """
 
 
-class Idefics3ProcessorSize(NamedTuple):
-    """Hashable wrapper for unhashable `size` dict of Idefics3Processor."""
-    # NOTE: cached_get_processor/cached_get_image_processor uses lru_cache,
-    # we need to use NamedTuple instead of TypedDict to avoid hashing issues.
-    longest_edge: int
-
-    def __contains__(self, key: str) -> bool:
-        return key in self._asdict() and getattr(self, key) is not None
-
-    def __getitem__(self, key: str) -> int:
-        return getattr(self, key)
-
-
 ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
 
 
-def get_mm_processor_kwargs(size: Optional[Dict[str, int]] = None) -> Dict:
-    mm_processor_kwargs = {}
-    if size:
-        mm_processor_kwargs["size"] = Idefics3ProcessorSize(**size)
-    return mm_processor_kwargs
-
-
-def input_mapper_for_idefics3(
-    ctx: InputContext,
-    data: object,
-    *,
-    size: Optional[Dict[str, int]] = None,
-):
-    model_config = ctx.model_config
-    mm_processor_kwargs = get_mm_processor_kwargs(size)
-    image_processor = cached_get_image_processor(
-        model_config.model,
-        trust_remote_code=model_config.trust_remote_code,
-        **mm_processor_kwargs)
-    if image_processor is None:
-        raise RuntimeError("No HuggingFace processor is available "
-                           "to process the image object")
-
-    if isinstance(data, Image.Image):
-        images = [[data]]
-    elif is_list_of(data, Image.Image):
-        images = [data]
-    else:
-        raise TypeError(f"Invalid image type: {type(data)}")
-
-    try:
-        batch_data = image_processor(images,
-                                     return_tensors="pt",
-                                     return_row_col_info=True).data
-    except Exception:
-        logger.error("Failed to process image (%s)", data)
-        raise
-
-    return MultiModalKwargs(batch_data)
-
-
-def _resize_output_size(height: int,
-                        width: int,
-                        max_len: Optional[int] = None,
-                        min_len: Optional[int] = 1,
-                        max_size: Optional[int] = None) -> Tuple[int, int]:
-    # Set default value for max_len if not provided
-    max_len = max(height, width) if max_len is None else max_len
-    aspect_ratio = width / height
-
-    # Handle the maximum size constraint
-    if max_size is not None:
-        max_len = min(max_len, max_size)
-
-    # Adjust dimensions according to the aspect ratio
-    if width >= height:
-        width = max_len
-        height = int(width / aspect_ratio)
-    else:
-        height = max_len
-        width = int(height * aspect_ratio)
-
-    # Ensure both width and height are even (if needed)
-    height += 1 if height % 2 != 0 else 0
-    width += 1 if width % 2 != 0 else 0
-
-    # Ensure dimensions are not smaller than the minimum length
-    height = max(height, min_len)
-    width = max(width, min_len)
-
-    return height, width
-
-
-def _get_resize_output_image_size(
-    image_size: Tuple[int, int],
-    resolution_max_side: int,
-    max_image_size: int = 1820,
-) -> Tuple[int, int]:
-    if resolution_max_side > max_image_size:
-        raise ValueError(
-            "`resolution_max_side` cannot be larger than `max_image_size`")
-
-    height, width = image_size
-
-    # Find the output size, when rescaling the longest edge to max_len and
-    # preserving the aspect ratio
-    height, width = _resize_output_size(height,
-                                        width,
-                                        max_len=resolution_max_side)
-
-    return height, width
-
-
-def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int,
-                        fake_token_around_image: str, image_token: str,
-                        global_img_token: str) -> str:
-    """
-    Prompt with expanded image tokens for when the image is split 
-    into patches.
-    """
-    text_split_images = ""
-    for n_h in range(image_rows):
-        for n_w in range(image_cols):
-            text_split_images += (fake_token_around_image +
-                                  f"<row_{n_h + 1}_col_{n_w + 1}>" +
-                                  image_token * image_seq_len)
-        text_split_images += "\n"
-
-    text_split_images += "\n" + _prompt_single_image(
-        image_seq_len=image_seq_len,
-        fake_token_around_image=fake_token_around_image,
-        image_token=image_token,
-        global_img_token=global_img_token)
-    return text_split_images
-
-
-def _prompt_single_image(image_seq_len: int, fake_token_around_image: str,
-                         image_token: str, global_img_token: str):
-    """Prompt with expanded image tokens for a single image."""
-    return (fake_token_around_image + global_img_token +
-            image_token * image_seq_len + fake_token_around_image)
-
-
-def _get_image_prompt_string(image_rows: int, image_cols: int,
-                             image_seq_len: int, fake_token_around_image: str,
-                             image_token: str, global_img_token: str):
-    if image_rows == 0 and image_cols == 0:
-        return _prompt_single_image(
-            image_seq_len=image_seq_len,
-            fake_token_around_image=fake_token_around_image,
-            image_token=image_token,
-            global_img_token=global_img_token,
-        )
-    return _prompt_split_image(image_seq_len, image_rows, image_cols,
-                               fake_token_around_image, image_token,
-                               global_img_token)
-
-
-def input_processor_for_idefics3(ctx: InputContext,
-                                 inputs: DecoderOnlyInputs,
-                                 *,
-                                 size: Optional[Dict[str, int]] = None):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    model_config = ctx.model_config
-    mm_processor_kwargs = get_mm_processor_kwargs(size)
-    processor = cached_get_processor(model_config.model, **mm_processor_kwargs)
-    image_processor = processor.image_processor
-    tokenizer = processor.tokenizer
-    size = image_processor.size['longest_edge']
-    max_image_size = image_processor.max_image_size['longest_edge']
-
-    image_data = multi_modal_data["image"]
-    if isinstance(image_data, Image.Image):
-        image_list = [image_data]
-    elif is_list_of(image_data, Image.Image):
-        image_list = image_data
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
-
-    image_rows = []
-    image_cols = []
-    for image in image_list:
-        height, width = _get_resize_output_image_size(image.size, size)
-
-        rows = math.ceil(height / max_image_size)
-        cols = math.ceil(width / max_image_size)
-        image_rows.append(rows)
-        image_cols.append(cols)
-    image_rows = [image_rows]
-    image_cols = [image_cols]
-
-    n_images_in_text = []
-
-    text = inputs.get("prompt")
-    if text is None:
-        prompt_token_ids = inputs.get("prompt_token_ids", [])
-        assert prompt_token_ids
-        text = tokenizer.decode(prompt_token_ids)
-
-    if isinstance(text, str):
-        text = [text]
-    elif not isinstance(text, list) and not isinstance(text[0], str):
-        raise ValueError("Invalid input text. Please provide a string, "
-                         "or a list of strings")
-
-    fake_image_token = processor.fake_image_token.content
-    image_token = processor.image_token.content
-    global_img_token = processor.global_image_tag
-
-    prompt_strings = []
-    for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
-        n_images_in_text.append(sample.count(image_token))
-
-        # Replace the image token with fake tokens around the expanded
-        # image token sequence of length `image_seq_len`
-        image_prompt_strings = []
-        for n_rows, n_cols in zip(sample_rows, sample_cols):
-            image_prompt_string = _get_image_prompt_string(
-                n_rows,
-                n_cols,
-                processor.image_seq_len,
-                image_token=image_token,
-                fake_token_around_image=fake_image_token,
-                global_img_token=global_img_token,
-            )
-            image_prompt_strings.append(image_prompt_string)
-
-        split_sample = sample.split(image_token)
-        if len(split_sample) == 0:
-            raise ValueError("The image token should be present in the text.")
-
-        # Place in the image prompt strings where the image tokens are
-        sample = split_sample[0]
-        for i, image_prompt_string in enumerate(image_prompt_strings):
-            sample += image_prompt_string + split_sample[i + 1]
-        prompt_strings.append(sample)
+class Idefics3ProcessingInfo(BaseProcessingInfo):
 
-    prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids
+    def get_hf_processor(
+            self,
+            *,
+            size: Optional[Dict[str, int]] = None) -> Idefics3Processor:
+        if size is not None:
+            return self.ctx.get_hf_processor(Idefics3Processor, size=size)
 
-    return token_inputs(
-        prompt_token_ids=prompt_token_ids,
-        prompt=prompt_strings[0],
-        multi_modal_data=multi_modal_data,
-    )
+        return self.ctx.get_hf_processor(Idefics3Processor)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
 
-def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int:
-    size = image_processor.size['longest_edge']
-    max_image_size = image_processor.max_image_size['longest_edge']
-    resized_height, resized_width = size, size
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        hf_processor = self.get_hf_processor()
+        image_processor: Idefics3ImageProcessor = hf_processor.image_processor
+        grid_w, grid_h = self._get_image_feature_grid_size(
+            image_width=image_processor.size['longest_edge'],
+            image_height=image_processor.size['longest_edge'],
+        )
+        num_image_token = (grid_w * grid_h + 1) * hf_processor.image_seq_len
+        # Calculate Non-image-token length
+        # NOTE: <row_1_col_1> and <global-img> are special token for SmolVLM
+        # but not for Idefic3, so we need to tokenize them to get actual length.
+        tokenizer = self.get_tokenizer()
+        tile_token_len = len(tokenizer.tokenize("<row_1_col_1>"))
+        glob_token_len = len(tokenizer.tokenize(hf_processor.global_image_tag))
+        # linebreak and <fake_token_around_image> always cost 1 token
+        fake_token_len = lb_len = 1
+        non_image_token = (grid_w * grid_h) * (
+            tile_token_len + fake_token_len) + glob_token_len + (
+                grid_h + 1) * lb_len + fake_token_len
+        return {"image": num_image_token + non_image_token}
+
+    def _resize_output_size(self,
+                            *,
+                            height: int,
+                            width: int,
+                            max_len: Optional[int] = None,
+                            min_len: Optional[int] = 1,
+                            max_size: Optional[int] = None) -> tuple[int, int]:
+        # Set default value for max_len if not provided
+        max_len = max(height, width) if max_len is None else max_len
+        aspect_ratio = width / height
+
+        # Handle the maximum size constraint
+        if max_size is not None:
+            max_len = min(max_len, max_size)
+
+        # Adjust dimensions according to the aspect ratio
+        if width >= height:
+            width = max_len
+            height = int(width / aspect_ratio)
+        else:
+            height = max_len
+            width = int(height * aspect_ratio)
 
-    grid_h = resized_height // max_image_size
-    grid_w = resized_width // max_image_size
-    return (grid_h * grid_w + 1)
+        # Ensure both width and height are even (if needed)
+        height += height % 2
+        width += width % 2
 
+        # Ensure dimensions are not smaller than the minimum length
+        height = max(height, min_len)
+        width = max(width, min_len)
 
-def get_max_idefics3_image_tokens(ctx: InputContext,
-                                  *,
-                                  size: Optional[Dict[str,
-                                                      int]] = None) -> int:
-    model_config = ctx.model_config
-    mm_processor_kwargs = get_mm_processor_kwargs(size)
-    processor = cached_get_processor(model_config.model, **mm_processor_kwargs)
-    image_seq_len = processor.image_seq_len
-    image_processor = processor.image_processor
+        return height, width
 
-    max_num_image_patches = _get_max_num_image_patch(image_processor)
+    def _get_resize_output_image_size(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        resolution_max_side: int,
+    ) -> tuple[int, int]:
+        hf_processor = self.get_hf_processor()
+        image_processor: Idefics3ImageProcessor = hf_processor.image_processor
+        max_image_size = image_processor.size['longest_edge']
+        if resolution_max_side > max_image_size:
+            raise ValueError(
+                "`resolution_max_side` cannot be larger than `max_image_size`")
+
+        height, width = image_height, image_width
+
+        # Find the output size, when rescaling the longest edge to max_len and
+        # preserving the aspect ratio
+        height, width = self._resize_output_size(height=height,
+                                                 width=width,
+                                                 max_len=resolution_max_side)
+        return height, width
+
+    def _get_image_feature_grid_size(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        size: Optional[dict[str, object]] = None,
+    ) -> tuple[int, int]:
+        hf_processor = self.get_hf_processor(size=size)
+        image_processor: Idefics3ImageProcessor = hf_processor.image_processor
+        max_image_size = image_processor.max_image_size['longest_edge']
+        size = image_processor.size['longest_edge']
+        assert size % max_image_size == 0, (
+            "`longest_edge` in image_processor's `size` must be divisible by "
+            "`longest_edge` in `max_image_size`, this may be caused by "
+            "incorrect mm_kwargs override.")
+
+        resized_height, resized_width = self._get_resize_output_image_size(
+            image_width=image_width,
+            image_height=image_height,
+            resolution_max_side=size,
+        )
+        if resized_height > max_image_size or resized_width > max_image_size:
+            grid_h = math.ceil(resized_height / max_image_size)
+            grid_w = math.ceil(resized_width / max_image_size)
+        else:
+            grid_h = grid_w = 0
+        return grid_w, grid_h
 
-    return max_num_image_patches * image_seq_len
 
+class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
+                                 ):
 
-def dummy_data_for_idefics3(
-        ctx: InputContext,
+    def get_dummy_processor_inputs(
+        self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        *,
-        size: Optional[Dict[str, int]] = None) -> DummyData:
-    hf_config = ctx.get_hf_config()
-    num_images = mm_counts["image"]
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+        hf_processor = self.info.get_hf_processor()
+        image_processor: Idefics3ImageProcessor = hf_processor.image_processor
+        longest_edge = image_processor.max_image_size['longest_edge']
+        image_token: str = hf_processor.image_token.content
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=longest_edge,
+                                   height=longest_edge,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
+        )
 
-    mm_processor_kwargs = get_mm_processor_kwargs(size)
-    processor = cached_get_processor(ctx.model_config.model,
-                                     **mm_processor_kwargs)
-    max_num_image_patches = _get_max_num_image_patch(processor.image_processor)
-    image_seq_len = processor.image_seq_len
-    max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images
 
-    if seq_len - max_llm_image_tokens < 0:
-        raise RuntimeError(
-            f"Idefics3 cannot process {num_images} images in a prompt, "
-            "please increase max_model_len or reduce image limit by "
-            "--limit-mm-per-prompt.")
+class Idefics3MultimodalProcessor(
+        BaseMultiModalProcessor[Idefics3ProcessingInfo]):
 
-    seq_data = SequenceData.from_prompt_token_counts(
-        (hf_config.image_token_id, max_llm_image_tokens),
-        (0, seq_len - max_llm_image_tokens))
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if mm_data:
+            processed_outputs = super()._call_hf_processor(
+                prompt, mm_data, mm_kwargs)
+            image_grids = [
+                self.info._get_image_feature_grid_size(
+                    image_width=img.width,
+                    image_height=img.height,
+                    **mm_kwargs,
+                ) for img in mm_data["images"]
+            ]
+            image_patches = list(map(lambda x: math.prod(x) + 1, image_grids))
+            for key in ("pixel_values", "pixel_attention_mask"):
+                data = processed_outputs.pop(key)
+                data = data.flatten(0, 1).split(image_patches)
+                processed_outputs[key] = data
+        else:
+            tokenizer = self.info.get_tokenizer()
+            processed_outputs = tokenizer(prompt,
+                                          add_special_tokens=True,
+                                          return_tensors="pt")
+        return processed_outputs
 
-    width = height = hf_config.vision_config.image_size
-    image = Image.new("RGB", (width, height), color=0)
-    mm_data = {"image": [image] if num_images == 1 else [image] * num_images}
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            pixel_attention_mask=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
 
-    return DummyData(seq_data, mm_data)
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        image_token = hf_processor.image_token.content
+        fake_image_token = hf_processor.fake_image_token.content
+        global_img_token = hf_processor.global_image_tag
+        image_seq_len = hf_processor.image_seq_len
+        grid_placeholder = "<row_{n_h}_col_{n_w}>"
+
+        p_img = image_token * image_seq_len
+        global_img_placeholder = fake_image_token + global_img_token + p_img
+        tile_img_placeholder = fake_image_token + grid_placeholder + p_img
+
+        def get_replacement_idefics3(item_idx: int) -> str:
+            images = mm_items.get_items("image", ImageProcessorItems)
+
+            image_size = images.get_image_size(item_idx)
+            grid_w, grid_h = self.info._get_image_feature_grid_size(
+                image_width=image_size.width,
+                image_height=image_size.height,
+                **hf_processor_mm_kwargs,
+            )
+            if grid_w == 0 and grid_h == 0:
+                image_placeholder = global_img_placeholder
+            else:
+                tiles_placeholder = list[str]()
+                for i in range(grid_h):
+                    for j in range(grid_w):
+                        placeholder_per_tile = tile_img_placeholder.format(
+                            n_h=i + 1, n_w=j + 1)
+                        tiles_placeholder.append(placeholder_per_tile)
+                        # Add line break if it is the last tile in the row
+                        if j == grid_w - 1:
+                            tiles_placeholder.append("\n")
+
+                image_placeholder = "".join(
+                    [*tiles_placeholder, "\n", global_img_placeholder])
+            return image_placeholder + fake_image_token
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=image_token,
+                replacement=get_replacement_idefics3,
+            )
+        ]
 
 
 class Idefics3SimpleMLP(nn.Module):
@@ -453,7 +399,7 @@ class Idefics3Model(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
-        config = vllm_config.model_config.hf_config
+        config: Idefics3Config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
         self.config = config
@@ -541,15 +487,13 @@ class Idefics3Model(nn.Module):
         self,
         pixel_values: torch.Tensor,
         pixel_attention_mask: Optional[torch.BoolTensor] = None,
-    ) -> torch.Tensor:
+    ) -> NestedTensors:
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        batch_size, num_images, num_channels, height, width = pixel_values.shape
+        num_patches = [x.size(0) for x in pixel_values]
         pixel_values = pixel_values.to(
             dtype=self.vision_model.embeddings.patch_embedding.weight.dtype
         )  # fp16 compatibility
-        pixel_values = pixel_values.view(batch_size * num_images,
-                                         *pixel_values.shape[2:])
 
         # Remove padding images - padding images are full 0.
         nb_values_per_image = pixel_values.shape[1:].numel()
@@ -567,8 +511,6 @@ class Idefics3Model(nn.Module):
             )
         else:
             # Remove padding images from the mask
-            pixel_attention_mask = pixel_attention_mask.view(
-                batch_size * num_images, *pixel_attention_mask.shape[2:])
             pixel_attention_mask = pixel_attention_mask[
                 real_images_inds].contiguous()
 
@@ -587,10 +529,10 @@ class Idefics3Model(nn.Module):
             patch_attention_mask=patch_attention_mask,
         )
 
-        return image_hidden_states
+        return image_hidden_states.split(num_patches)
 
     def _process_image_pixels(
-            self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor:
+            self, inputs: Idefics3ImagePixelInputs) -> NestedTensors:
         assert self.vision_model is not None
 
         pixel_values = inputs["data"]
@@ -605,7 +547,9 @@ class Idefics3Model(nn.Module):
 
         assert self.vision_model is not None
         image_features = self._process_image_pixels(image_input)
-        return self.connector(image_features)
+        num_patches = [x.size(0) for x in image_features]
+        image_features = torch.cat(image_features)
+        return self.connector(image_features).split(num_patches)
 
     def get_input_embeddings(
         self,
@@ -634,10 +578,10 @@ class Idefics3Model(nn.Module):
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_idefics3)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3)
+@MULTIMODAL_REGISTRY.register_processor(
+    Idefics3MultimodalProcessor,
+    info=Idefics3ProcessingInfo,
+    dummy_inputs=Idefics3DummyInputsBuilder)
 class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
                                        SupportsLoRA):
     packed_modules_mapping = {
@@ -689,7 +633,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         if self.config.text_config.tie_word_embeddings:
             self.lm_head.weight = self.model.text_model.wte.weight
         self.logits_processor = LogitsProcessor(config.text_config.vocab_size)
-        self.sampler = Sampler()
+        self.sampler = get_sampler()
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self.model._parse_and_validate_image_input(**kwargs)
-- 
GitLab


From 6469038b149425e25b77c1ef93adf0e7712fd100 Mon Sep 17 00:00:00 2001
From: Michael Greenbaum <48786769+mgtk77@users.noreply.github.com>
Date: Tue, 4 Feb 2025 14:58:48 +0200
Subject: [PATCH 33/65] [Bugfix] Fix loading of fine-tuned models based on
 Phi-3-Small (#12689)

Signed-off-by: Michael Greenbaum <mgreenbaum@microsoft.com>
Co-authored-by: Michael Greenbaum <mgreenbaum@microsoft.com>
---
 vllm/model_executor/models/phi3_small.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index a8b7e9b2a..873e9d377 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -476,6 +476,8 @@ class Phi3SmallForCausalLM(nn.Module, SupportsPP):
                 continue
             if is_pp_missing_parameter(name, self):
                 continue
+            if "lm_head.weight" in name and self.config.tie_word_embeddings:
+                continue
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
-- 
GitLab


From 62467a834a94566e2d81a276817a20174b474151 Mon Sep 17 00:00:00 2001
From: Kero Liang <kerorek@outlook.com>
Date: Tue, 4 Feb 2025 21:03:19 +0800
Subject: [PATCH 34/65] Avoid unnecessary multi-modal input data copy when
 len(batch) == 1 (#12722)

Signed-off-by: imkero <kerorek@outlook.com>
---
 vllm/multimodal/inputs.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index fe24c7282..8e4af7f88 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -212,6 +212,11 @@ class MultiModalBatchedField(BaseMultiModalField):
 
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
+            if len(batch) == 1:
+                # An optimization when `batch` contains only one tensor:
+                # - produce exactly same result as `torch.stack(batch)`
+                # - will achieve zero-copy if the tensor is contiguous
+                return batch[0].unsqueeze(0).contiguous()
             first_shape = batch[0].shape
             if all(elem.shape == first_shape for elem in batch):
                 return torch.stack(batch)
@@ -235,6 +240,11 @@ class MultiModalFlatField(BaseMultiModalField):
 
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
+            if len(batch) == 1:
+                # An optimization when `batch` contains only one tensor:
+                # - produce exactly same result as `torch.concat(batch)`
+                # - will achieve zero-copy if the tensor is contiguous
+                return batch[0].contiguous()
             first_shape = batch[0].shape
             if all(elem.shape[1:] == first_shape[1:] for elem in batch):
                 return torch.concat(batch)
@@ -407,6 +417,12 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
             return stacked
 
         tensors_ = cast(list[torch.Tensor], stacked)
+        if len(tensors_) == 1:
+            # An optimization when `tensors_` contains only one tensor:
+            # - produce exactly same result as `torch.stack(tensors_)`
+            # - will achieve zero-copy if the tensor is contiguous
+            return tensors_[0].unsqueeze(0).contiguous()
+
         if any(t.shape != tensors_[0].shape for t in tensors_):
             # The tensors have incompatible shapes and can't be stacked.
             return tensors_
-- 
GitLab


From 649550f27e4ad35e7e3352800438991fdaf150a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophie=20du=20Cou=C3=A9dic?=
 <sophie.du.couedic.de.kergoualer@ibm.com>
Date: Tue, 4 Feb 2025 14:19:12 +0100
Subject: [PATCH 35/65] [Build] update requirements of no-device for plugin
 usage (#12630)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sophie du Couédic <sop@zurich.ibm.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 50265d46e..a4043c43a 100755
--- a/setup.py
+++ b/setup.py
@@ -556,7 +556,7 @@ def get_requirements() -> List[str]:
         return resolved_requirements
 
     if _no_device():
-        requirements = _read_requirements("requirements-cpu.txt")
+        requirements = _read_requirements("requirements-common.txt")
     elif _is_cuda():
         requirements = _read_requirements("requirements-cuda.txt")
         cuda_major, cuda_minor = torch.version.cuda.split(".")
-- 
GitLab


From 18016a5e627d2a4b69af599272a5aa8ce71b98c8 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 4 Feb 2025 23:54:23 +0800
Subject: [PATCH 36/65] [Bugfix] Fix CI failures for InternVL and Mantis models
 (#12728)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../vision_language/test_models.py            |  17 +-
 tests/models/registry.py                      |   3 +-
 tests/multimodal/test_processing.py           |  69 +++
 tests/multimodal/test_processor_kwargs.py     | 402 ------------------
 4 files changed, 79 insertions(+), 412 deletions(-)
 delete mode 100644 tests/multimodal/test_processor_kwargs.py

diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 5fe46bd75..85bc4ac13 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,6 +9,7 @@ from pathlib import PosixPath
 from typing import Type
 
 import pytest
+from packaging.version import Version
 from transformers import AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 
@@ -154,13 +155,7 @@ VLM_TEST_SETTINGS = {
         stop_str=["<|im_end|>"],
         image_size_factors=[(0.10, 0.15)],
         max_tokens=64,
-        marks=[
-            pytest.mark.skipif(
-                TRANSFORMERS_VERSION < "4.48.0",
-                reason="HF model requires transformers>=4.48.0",
-            ),
-            large_gpu_mark(min_gb=64),
-        ],
+        marks=[large_gpu_mark(min_gb=64)],
     ),
     "blip2": VLMTestInfo(
         models=["Salesforce/blip2-opt-2.7b"],
@@ -206,7 +201,7 @@ VLM_TEST_SETTINGS = {
         image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
         marks=[
             pytest.mark.skipif(
-                TRANSFORMERS_VERSION >= "4.48.0",
+                Version(TRANSFORMERS_VERSION) >= Version("4.48"),
                 reason="HF model is not compatible with transformers>=4.48.0",
             )
         ],
@@ -339,6 +334,12 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
+        marks=[
+            pytest.mark.skipif(
+                Version(TRANSFORMERS_VERSION) >= Version("4.48"),
+                reason="HF model is not compatible with transformers>=4.48.0",
+            )
+        ],
     ),
     "minicpmv_25": VLMTestInfo(
         models=["openbmb/MiniCPM-Llama3-V-2_5"],
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 7b5032f79..285fbe484 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -224,8 +224,7 @@ _CROSS_ENCODER_EXAMPLE_MODELS = {
 
 _MULTIMODAL_EXAMPLE_MODELS = {
     # [Decoder-only]
-    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria",
-                                                    min_transformers_version="4.48"),
+    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
     "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"),  # noqa: E501
     "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
     "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 6cccd2aa2..459c0d9d1 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from contextlib import nullcontext
+from types import MethodType
 from typing import cast
 from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
+from transformers import ProcessorMixin
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -636,3 +638,70 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
             mm_data=mm_data,
             hf_processor_mm_kwargs={},
         )
+
+
+class _ProcessorProxy:
+
+    def __init__(self, processor: ProcessorMixin) -> None:
+        super().__init__()
+
+        self.__processor = processor
+
+    def __getattr__(self, key: str):
+        return getattr(self.__processor, key)
+
+    def __call__(
+        self,
+        text=None,
+        images=None,
+        videos=None,
+        exists=None,
+        return_tensors=None,
+    ):
+        return dict(exists=exists)
+
+
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"])  # Dummy
+# yapf: disable
+@pytest.mark.parametrize(
+    ("call_kwargs", "expected_kwargs"),
+    [
+        # Should ignore invalid kwargs
+        ({"does_not_exist": 100}, {"exists": None}),
+        ({"exists": 1}, {"exists": 1}),
+        ({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
+    ],
+)
+# yapf: enable
+def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
+    model_config = ModelConfig(
+        model=model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="half",
+        revision=None,
+    )
+
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+    orig_get_hf_processor = processor.info.get_hf_processor
+
+    def get_hf_processor(self, **kwargs):
+        assert kwargs == call_kwargs
+        return _ProcessorProxy(orig_get_hf_processor())
+
+    processor.info.get_hf_processor = MethodType(get_hf_processor,
+                                                 processor.info)
+
+    out_kwargs = processor._call_hf_processor(
+        prompt="",
+        mm_data={},
+        mm_kwargs=call_kwargs,
+    )
+
+    assert out_kwargs == expected_kwargs
diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py
deleted file mode 100644
index 5d18b2ed7..000000000
--- a/tests/multimodal/test_processor_kwargs.py
+++ /dev/null
@@ -1,402 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from array import array
-from typing import Callable, Dict, Mapping, Optional
-from unittest.mock import patch
-
-import pytest
-import torch
-
-from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext,
-                         InputRegistry, ProcessorInputs, token_inputs)
-from vllm.multimodal import MultiModalRegistry
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
-
-from ..models.utils import build_model_context
-
-# Used for fast tests where the model doesn't matter
-DUMMY_MODEL_ID = "facebook/opt-125m"
-# Used for tests that need a multimodal model
-MULTIMODAL_MODEL_ID = "OpenGVLab/InternVL2-2B"
-
-# For mm_processor_kwargs - we test overrides by defining mocks for each place
-# it is used, and ensuring that we can pass processor kwargs an override value
-# to receive the intended result for things like sequence length etc.
-DEFAULT_MAX_DYNAMIC_PATCH = 6
-MAX_DYNAMIC_PATCH_OVERRIDE = 4
-
-
-# Mocks for all of the places that we use the mm_processor_kwargs
-# to override values in different callables
-@pytest.fixture
-def use_processor_mock():
-    """Patches the internal model input processor with an override callable."""
-
-    def custom_processor(ctx: InputContext,
-                         inputs: DecoderOnlyInputs,
-                         *,
-                         max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH):
-        # For testing purposes, we don't worry about the prompt
-        return token_inputs(
-            prompt_token_ids=[],
-            mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch})
-
-    with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor",
-               return_value=custom_processor):
-        yield
-
-
-@pytest.fixture
-def use_dummy_data_mock():
-    """Patches the internal model input processor with an override callable."""
-
-    def custom_dummy_data_factory(self,
-                                  ctx: InputContext,
-                                  seq_len: int,
-                                  mm_counts: Mapping[str, int],
-                                  *,
-                                  max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH):
-        seq_data = SequenceData(
-            array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * max_dynamic_patch))
-        return DummyData(seq_data, None)
-
-    with patch(
-            "vllm.inputs.registry.InputRegistry._default_dummy_data_factory",
-            custom_dummy_data_factory):
-        yield
-
-
-# Lazy import to avoid CUDA reinitialization error
-def mm_model_cls():
-    from vllm.model_executor.models.internvl import InternVLChatModel
-
-    return InternVLChatModel
-
-
-# lambda whose signature matches max token calcs extra & mapper + extra kwargs
-get_max_dynamic_patch = lambda ctx, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: max_dynamic_patch  # noqa: E501
-custom_mapper = lambda ctx, data, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: {  # noqa: E501
-    "pixel_values": torch.zeros(size=(1, max_dynamic_patch + 1, 3, 448, 448))
-}
-
-
-### Tests for default processor logic & mm_processor_kwargs wrapping
-def test_default_processor_is_a_noop():
-    """Ensure that by default, there is no processor override."""
-    dummy_registry = InputRegistry()
-    ctx = build_model_context(DUMMY_MODEL_ID)
-    processor = dummy_registry.create_input_processor(ctx.model_config)
-    proc_inputs = token_inputs(prompt_token_ids=[], prompt="")
-    proc_outputs = processor(inputs=proc_inputs)
-    assert proc_inputs is proc_outputs
-
-
-def _get_max_dynamic_patch_info(init_max_dynamic_patch: int,
-                                inference_max_dynamic_patch: int):
-    """Get the init / inference kwargs and expected max_dynamic_patch."""
-    # If we have a value for max_dynamic_patch, pass the override value and make
-    # sure we get that value as a return-value from out mock processor,
-    # otherwise fall back to the default value
-    init_kwargs = None if init_max_dynamic_patch is None else {
-        "max_dynamic_patch": init_max_dynamic_patch
-    }
-    inference_kwargs = None if inference_max_dynamic_patch is None else {
-        "max_dynamic_patch": inference_max_dynamic_patch
-    }
-    if inference_max_dynamic_patch is not None:
-        expected_seq_count = inference_max_dynamic_patch
-    elif init_max_dynamic_patch is not None:
-        expected_seq_count = init_max_dynamic_patch
-    else:
-        expected_seq_count = DEFAULT_MAX_DYNAMIC_PATCH
-    return init_kwargs, inference_kwargs, expected_seq_count
-
-
-def _get_processed_max_dynamic_patch(
-    processor: Callable[[ProcessorInputs], ProcessorInputs],
-    inference_kwargs: Optional[Dict[str, int]],
-) -> int:
-    processed_inputs = processor(
-        token_inputs(prompt_token_ids=[],
-                     prompt="",
-                     mm_processor_kwargs=inference_kwargs))
-
-    assert "type" in processed_inputs
-    assert processed_inputs["type"] == "token"
-    assert "mm_processor_kwargs" in processed_inputs
-    return processed_inputs["mm_processor_kwargs"]["max_dynamic_patch"]
-
-
-@pytest.mark.parametrize(
-    "init_max_dynamic_patch,inference_max_dynamic_patch", [
-        (None, None),
-        (MAX_DYNAMIC_PATCH_OVERRIDE, None),
-        (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE),
-    ])
-def test_input_processor_kwargs(use_processor_mock, init_max_dynamic_patch,
-                                inference_max_dynamic_patch):
-    """Ensure input processors can use processor kwargs."""
-    dummy_registry = InputRegistry()
-
-    (init_kwargs, inference_kwargs,
-     expected_seq_count) = _get_max_dynamic_patch_info(
-         init_max_dynamic_patch, inference_max_dynamic_patch)
-
-    ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs)
-    processor = dummy_registry.create_input_processor(ctx.model_config)
-    max_dynamic_patch_val = _get_processed_max_dynamic_patch(
-        processor, inference_kwargs)
-
-    assert max_dynamic_patch_val == expected_seq_count
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_processor_with_sad_kwarg_overrides(use_processor_mock,
-                                            mm_processor_kwargs):
-    """Ensure that input processors filter out invalid mm_processor_kwargs"""
-    dummy_registry = InputRegistry()
-    # Should filter out the init time kwargs
-    ctx = build_model_context(DUMMY_MODEL_ID,
-                              mm_processor_kwargs=mm_processor_kwargs)
-
-    processor = dummy_registry.create_input_processor(ctx.model_config)
-    # Should filter out the inference time kwargs
-    max_dynamic_patch_val = _get_processed_max_dynamic_patch(
-        processor, mm_processor_kwargs)
-    assert max_dynamic_patch_val == DEFAULT_MAX_DYNAMIC_PATCH
-
-
-### Test overrides for the dummy data
-@pytest.mark.parametrize("max_dynamic_patch",
-                         [None, MAX_DYNAMIC_PATCH_OVERRIDE])
-def test_dummy_data_kwarg_overrides(use_dummy_data_mock, max_dynamic_patch):
-    """Ensure dummy data factories can use processor kwargs."""
-    mm_processor_kwargs = None if max_dynamic_patch is None else {
-        "max_dynamic_patch": max_dynamic_patch
-    }
-    expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH
-                          if max_dynamic_patch is None else max_dynamic_patch)
-    dummy_registry = InputRegistry()
-    ctx = build_model_context(DUMMY_MODEL_ID,
-                              mm_processor_kwargs=mm_processor_kwargs)
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    # NOTE: seq_len is thrown away here since this will leverage the
-    # default dummy data factory that we have patched in, whose seq
-    # len is solely dependent on the value of the mm_processor_kwargs.
-    dummy_data = dummy_registry.dummy_data_for_profiling(
-        ctx.model_config, seq_len=-1, mm_registry=mm_registry)
-    assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock,
-                                             mm_processor_kwargs):
-    """Ensure the dummy data factory filters out invalid mm_processor_kwargs"""
-    dummy_registry = InputRegistry()
-    ctx = build_model_context(DUMMY_MODEL_ID,
-                              mm_processor_kwargs=mm_processor_kwargs)
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    # NOTE: seq_len is thrown away here since this will leverage the
-    # default dummy data factory that we have patched in, whose seq
-    # len is solely dependent on the value of the mm_processor_kwargs.
-    dummy_data = dummy_registry.dummy_data_for_profiling(
-        ctx.model_config, seq_len=-1, mm_registry=mm_registry)
-    assert len(
-        dummy_data.seq_data.prompt_token_ids) == DEFAULT_MAX_DYNAMIC_PATCH
-
-
-### Test overrides for the max token count per multimodal instance
-@pytest.mark.parametrize("max_dynamic_patch",
-                         [None, MAX_DYNAMIC_PATCH_OVERRIDE])
-def test_max_tokens_kwarg_overrides(max_dynamic_patch):
-    """Ensure max token calcs can use processor kwargs."""
-    mm_processor_kwargs = None if max_dynamic_patch is None else {
-        "max_dynamic_patch": max_dynamic_patch
-    }
-    expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH
-                          if max_dynamic_patch is None else max_dynamic_patch)
-
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=mm_processor_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our max_dynamic_patch value back from the mm_processor_kwargs.
-    with patch.object(
-            mm_registry._get_plugin("image"),
-            "_max_mm_tokens",
-        {mm_model_cls(): get_max_dynamic_patch},
-    ):
-        max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
-            ctx.model_config)
-
-    assert expected_seq_count == max_multimodal_tokens
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
-    """Ensure that max token calcs filters out invalid mm_processor_kwargs"""
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=mm_processor_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    # Similar before, but since these kwargs get filtered,
-    # we always get our default value back.
-    with patch.object(
-            mm_registry._get_plugin("image"),
-            "_max_mm_tokens",
-        {mm_model_cls(): get_max_dynamic_patch},
-    ):
-        max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
-            ctx.model_config)
-
-    assert max_multimodal_tokens == DEFAULT_MAX_DYNAMIC_PATCH
-
-
-### Test overrides for the mapper
-@pytest.mark.parametrize(
-    "max_dynamic_patch",
-    [DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE])
-def test_default_mapper_with_processor_kwargs(image_assets, max_dynamic_patch):
-    """Ensure that the mapper processor kwargs can fall back to HF models."""
-    # NOTE - we don't validate bad inputs for the default mapper, because it's
-    # through the automodel interface in transformers, so we can't easily
-    # inspect what kwargs are or are not allowed.
-    ctx = build_model_context(
-        MULTIMODAL_MODEL_ID,
-        task="generate",
-        trust_remote_code=True,
-        mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch},
-        limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": image}
-
-    mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs)
-    # pixel vals should have shape: [batch, max_dynamic_patch+1, ...]
-    assert mapped_inputs["pixel_values"].shape[1] == max_dynamic_patch + 1
-
-
-@pytest.mark.parametrize(
-    "init_max_dynamic_patch,inference_max_dynamic_patch", [
-        (None, None),
-        (MAX_DYNAMIC_PATCH_OVERRIDE, None),
-        (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE),
-    ])
-def test_custom_mapper_kwarg_overrides(image_assets, init_max_dynamic_patch,
-                                       inference_max_dynamic_patch):
-    """Ensure custom mappers can use processor kwargs."""
-    (init_kwargs, inference_kwargs,
-     expected_seq_count) = _get_max_dynamic_patch_info(
-         init_max_dynamic_patch, inference_max_dynamic_patch)
-
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=init_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": image}
-
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our max_dynamic_patch value back from the mm_processor_kwargs.
-    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
-        mm_model_cls())
-    mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs,
-                                          inference_kwargs)
-
-    assert mapped_inputs["pixel_values"].shape[1] == expected_seq_count + 1
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
-                                                mm_processor_kwargs):
-    """Ensure that custom mappers filters out invalid mm_processor_kwargs"""
-    # Should filter out the init time kwargs
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=mm_processor_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": image}
-
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our max_dynamic_patch value back from the mm_processor_kwargs.
-    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
-        mm_model_cls())
-    # Should filter out the inference time kwargs
-    mapped_inputs = mm_registry.map_input(
-        ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs)
-
-    assert mapped_inputs["pixel_values"].shape[1] == (
-        DEFAULT_MAX_DYNAMIC_PATCH + 1)
-- 
GitLab


From 233df6f5c4520ae57e4a24acfbaedcc9ce166074 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Wed, 5 Feb 2025 00:46:54 +0000
Subject: [PATCH 37/65] [V1][Metrics] Add request_success_total counter,
 labelled with finish reason (#12579)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/entrypoints/openai/test_metrics.py |  1 +
 vllm/v1/engine/__init__.py               | 21 +++++++++++++++++++--
 vllm/v1/engine/detokenizer.py            |  9 +++++----
 vllm/v1/engine/output_processor.py       | 22 ++++++++++++----------
 vllm/v1/metrics/loggers.py               | 15 ++++++++++++++-
 vllm/v1/metrics/stats.py                 | 10 +++++++---
 vllm/v1/request.py                       | 15 ++++++++-------
 7 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index a9134be62..de2333901 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -205,6 +205,7 @@ EXPECTED_METRICS_V1 = [
     "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:request_success_total",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",
     "vllm:request_prompt_tokens_count",
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 912b92862..6bd548bdc 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -15,6 +15,23 @@ if TYPE_CHECKING:
     from vllm.sampling_params import SamplingParams
 
 
+class RequestFinishedReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, or abort.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+
+    def __str__(self):
+        return self.name.lower()
+
+
 @dataclass
 class EngineCoreRequest:
 
@@ -45,7 +62,7 @@ class EngineCoreOutput(
     request_id: str
     new_token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -56,7 +73,7 @@ class EngineCoreOutputs(
         gc=False):  # type: ignore[call-arg]
 
     #NOTE(Nick): We could consider ways to make this more compact,
-    # e.g. columnwise layout and using an int enum for finish/stop reason
+    # e.g. columnwise layout
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 6d800f026..2bce23e68 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -8,7 +8,8 @@ from vllm.logger import init_logger
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
+                            RequestFinishedReason)
 
 logger = init_logger(__name__)
 
@@ -18,7 +19,7 @@ class DetokenizerOutput:
     output_text: str
     token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -147,13 +148,13 @@ class IncrementalDetokenizer:
                 stop_str, truncate_to = stop
                 if truncate_to != -1:
                     self.output_text = self.output_text[:truncate_to]
-                finish_reason = "stop"  # TODO: use constant
+                finish_reason = RequestFinishedReason.STOP
                 stop_reason = stop_str
 
         # TODO: handle stop_token_ids here too?
 
         # 3) Update the RequestOutput object with the new text.
-        finished = bool(finish_reason)
+        finished = finish_reason is not None
         if self.output_kind == RequestOutputKind.FINAL_ONLY \
             and not finished:
             return None
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index aeefd5239..947366691 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -161,8 +161,10 @@ class OutputProcessor:
                 engine_core_output)
 
             # 3) Create and handle RequestOutput objects.
-            if request_output := self._make_request_output(
-                    req_state, detokenizer_output):
+            if detokenizer_output is not None:
+                request_output = self._make_request_output(
+                    req_state, detokenizer_output)
+
                 if req_state.queue is not None:
                     # AsyncLLM: put into queue for handling by generate().
                     req_state.queue.put_nowait(request_output)
@@ -172,6 +174,8 @@ class OutputProcessor:
 
                 # Free completed requests.
                 if request_output.finished:
+                    assert detokenizer_output.finish_reason is not None
+
                     self.request_states.pop(req_id)
                     if not engine_core_output.finished:
                         # If req not finished in EngineCore, but Detokenizer
@@ -180,7 +184,8 @@ class OutputProcessor:
 
                     # Track per-request stats
                     iteration_stats.update_from_finished_request(
-                        request_output, req_state.stats)
+                        detokenizer_output.finish_reason, request_output,
+                        req_state.stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
@@ -191,12 +196,8 @@ class OutputProcessor:
     @staticmethod
     def _make_request_output(
         request_state: RequestState,
-        detokenizer_output: Optional[DetokenizerOutput],
-    ) -> Optional[RequestOutput]:
-
-        if detokenizer_output is None:
-            return None
-
+        detokenizer_output: DetokenizerOutput,
+    ) -> RequestOutput:
         request_output = RequestOutput.new(
             request_state.request_id,
             request_state.prompt,
@@ -207,7 +208,8 @@ class OutputProcessor:
         )
         if detokenizer_output.finished:
             completion_output = request_output.outputs[0]
-            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.finish_reason = str(
+                detokenizer_output.finish_reason)
             completion_output.stop_reason = detokenizer_output.stop_reason
 
         return request_output
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index f736e38f1..b62351a8f 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -2,13 +2,14 @@
 
 import time
 from abc import ABC, abstractmethod
-from typing import List
+from typing import Dict, List
 
 import numpy as np
 import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.engine import RequestFinishedReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -116,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_request_success: Dict[RequestFinishedReason,
+                                           prometheus_client.Counter] = {}
+        counter_request_success_base = prometheus_client.Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + ["finished_reason"])
+        for reason in RequestFinishedReason:
+            self.counter_request_success[
+                reason] = counter_request_success_base.labels(*(labelvalues +
+                                                                [str(reason)]))
+
         self.histogram_num_prompt_tokens_request = \
             prometheus_client.Histogram(
                 name="vllm:request_prompt_tokens",
@@ -163,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase):
             iteration_stats.num_generation_tokens)
 
         for finished_request in iteration_stats.finished_requests:
+            self.counter_request_success[finished_request.finish_reason].inc()
             self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 88f2c0835..36c95e07d 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput
+    from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
 
 
 @dataclass
@@ -32,6 +32,7 @@ class RequestStateStats:
 class FinishedRequestStats:
     """Stats associated with a finished request."""
 
+    finish_reason: "RequestFinishedReason"
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
 
@@ -73,8 +74,11 @@ class IterationStats:
         request_state_stats.num_generation_tokens += num_new_generation_tokens
         request_state_stats.last_token_time = now
 
-    def update_from_finished_request(self, request_output: "RequestOutput",
+    def update_from_finished_request(self,
+                                     finish_reason: "RequestFinishedReason",
+                                     request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
         self.finished_requests.append(
-            FinishedRequestStats(len(request_output.prompt_token_ids),
+            FinishedRequestStats(finish_reason,
+                                 len(request_output.prompt_token_ids),
                                  request_state_stats.num_generation_tokens))
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 0519d9e78..eb9bf99b4 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
@@ -109,7 +109,7 @@ class Request:
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[str, None]:
+    def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
@@ -149,7 +149,8 @@ class RequestStatus(enum.IntEnum):
         return status > RequestStatus.PREEMPTED
 
     @staticmethod
-    def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
+    def get_finished_reason(
+            status: "RequestStatus") -> Union[RequestFinishedReason, None]:
         return _FINISHED_REASON_MAP.get(status)
 
 
@@ -158,8 +159,8 @@ class RequestStatus(enum.IntEnum):
 # are longer than the model's length cap. Therefore, the stop
 # reason should also be "length" as in OpenAI API.
 _FINISHED_REASON_MAP = {
-    RequestStatus.FINISHED_STOPPED: "stop",
-    RequestStatus.FINISHED_LENGTH_CAPPED: "length",
-    RequestStatus.FINISHED_ABORTED: "abort",
-    RequestStatus.FINISHED_IGNORED: "length",
+    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
+    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
+    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
+    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
 }
-- 
GitLab


From 75e94309e8d8919e0daea041f6cd81a4b8c09060 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 4 Feb 2025 21:22:24 -0500
Subject: [PATCH 38/65] [Perf] Mem align KV caches for CUDA devices (MLA perf
 improvement) (#12676)

Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Lucas Wilkinson <lcwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
---
 csrc/cache.h                                  |   3 +
 csrc/cache_kernels.cu                         |  82 +++++-
 csrc/torch_bindings.cpp                       |   4 +
 tests/kernels/test_cache.py                   | 262 ++++++++++++++++++
 vllm/_custom_ops.py                           |   5 +
 vllm/attention/backends/triton_mla.py         |   5 +-
 vllm/attention/ops/triton_decode_attention.py |  16 +-
 vllm/envs.py                                  |  10 +
 vllm/utils.py                                 |  10 +
 vllm/worker/cache_engine.py                   |  66 ++++-
 10 files changed, 429 insertions(+), 34 deletions(-)

diff --git a/csrc/cache.h b/csrc/cache.h
index 55ed30bd8..cf4a65c29 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -15,6 +15,9 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                  std::vector<torch::Tensor> const& value_caches,
                  const torch::Tensor& block_mapping);
 
+void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
+                     const torch::Tensor& block_mapping);
+
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 23a46b6ed..0960888d1 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -46,7 +46,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
   char* src_ptr = static_cast<char*>(src.data_ptr());
   char* dst_ptr = static_cast<char*>(dst.data_ptr());
 
-  const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
+  // We use the stride instead of numel in case the cache is padded for memory
+  // alignment reasons, we assume the blocks data (inclusive of any padding)
+  // is contiguous in memory
+  const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
   const at::cuda::OptionalCUDAGuard device_guard(
       src_device.is_cuda() ? src_device : dst_device);
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -93,6 +96,24 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
   }
 }
 
+// Kernel for MLA, which works on a single joint kv_cache
+// Grid: (num_layers, num_pairs)
+template <typename scalar_t>
+__global__ void copy_blocks_mla_kernel(
+    int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping,
+    const int mem_footprint_per_block) {
+  const int layer_idx = blockIdx.x;
+  const int pair_idx = blockIdx.y;
+  scalar_t* cache = reinterpret_cast<scalar_t*>(cache_ptrs[layer_idx]);
+  int64_t src_block = block_mapping[2 * pair_idx];
+  int64_t dst_block = block_mapping[2 * pair_idx + 1];
+  int64_t src_offset = src_block * mem_footprint_per_block;
+  int64_t dst_offset = dst_block * mem_footprint_per_block;
+  for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) {
+    cache[dst_offset + i] = cache[src_offset + i];
+  }
+}
+
 }  // namespace vllm
 
 // Note: the key_caches and value_caches vectors are constant but
@@ -147,6 +168,42 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
       }));
 }
 
+// copy blocks kernel for MLA (assumes a joint KV-cache)
+void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
+                     const torch::Tensor& block_mapping) {
+  int num_layers = kv_caches.size();
+  if (num_layers == 0) {
+    return;
+  }
+  torch::Device cache_device = kv_caches[0].device();
+  TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
+
+  std::vector<int64_t> cache_ptrs(num_layers);
+  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
+    cache_ptrs[layer_idx] =
+        reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
+  }
+  torch::Tensor cache_ptrs_tensor =
+      torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
+          .to(cache_device);
+
+  int num_pairs = block_mapping.size(0);
+  // We use the stride instead of numel in case the cache is padded for memory
+  // alignment reasons, we assume the blocks data (inclusive of any padding)
+  // is contiguous in memory
+  int mem_footprint_per_block = kv_caches[0].stride(0);
+  dim3 grid(num_layers, num_pairs);
+  dim3 block(std::min(1024, mem_footprint_per_block));
+  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
+      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
+        vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
+            cache_ptrs_tensor.data_ptr<int64_t>(),
+            block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
+      }));
+}
+
 namespace vllm {
 
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@@ -254,6 +311,7 @@ __global__ void concat_and_cache_mla_kernel(
                                      // + pe_dim)]
     const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int block_stride,                    //
+    const int entry_stride,                    //
     const int kv_c_stride,                     //
     const int k_pe_stride,                     //
     const int kv_lora_rank,                    //
@@ -274,9 +332,8 @@ __global__ void concat_and_cache_mla_kernel(
                   int src_stride, int dst_stride, int size, int offset) {
     for (int i = threadIdx.x; i < size; i += blockDim.x) {
       const int64_t src_idx = token_idx * src_stride + i;
-      const int64_t dst_idx = block_idx * block_stride +
-                              block_offset * (kv_lora_rank + pe_dim) + i +
-                              offset;
+      const int64_t dst_idx =
+          block_idx * block_stride + block_offset * entry_stride + i + offset;
       if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
         dst[dst_idx] = src[src_idx];
       } else {
@@ -391,14 +448,14 @@ void reshape_and_cache_flash(
 // KV_T is the stored data type of kv-cache.
 // CACHE_T is the data type of key and value tensors.
 // KV_DTYPE is the real data type of kv-cache.
-#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)             \
-  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>           \
-      <<<grid, block, 0, stream>>>(                                    \
-          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                    \
-          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                    \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),             \
-          slot_mapping.data_ptr<int64_t>(), block_stride, kv_c_stride, \
-          k_pe_stride, kv_lora_rank, pe_dim, block_size,               \
+#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)              \
+  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>            \
+      <<<grid, block, 0, stream>>>(                                     \
+          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
+          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
+          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
+          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
+          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
           reinterpret_cast<const float*>(scale.data_ptr()));
 
 void concat_and_cache_mla(
@@ -428,6 +485,7 @@ void concat_and_cache_mla(
   int kv_c_stride = kv_c.stride(0);
   int k_pe_stride = k_pe.stride(0);
   int block_stride = kv_cache.stride(0);
+  int entry_stride = kv_cache.stride(1);
 
   dim3 grid(num_tokens);
   dim3 block(std::min(kv_lora_rank, 512));
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 186e9c0e8..c03806f43 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -450,6 +450,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "Tensor block_mapping) -> ()");
   cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
 
+  cache_ops.def(
+      "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
+  cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
+
   // Reshape the key and value tensors and cache them.
   cache_ops.def(
       "reshape_and_cache(Tensor key, Tensor value,"
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index 6f909b680..21c02c5de 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -9,6 +9,7 @@ import torch
 from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.utils import align_to_256bytes
 
 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -18,6 +19,13 @@ NUM_HEADS = [8]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 120, 256]
 BLOCK_SIZES = [8, 16, 32]
 
+# Parameters for MLA tests.
+KV_LORA_RANKS = [512]
+QK_ROPE_HEAD_DIMS = [64]
+NUM_TOKENS_MLA = [42]
+BLOCK_SIZES_MLA = [16]
+NUM_BLOCKS_MLA = [8]
+
 # Arbitrary values for testing
 # don't make it too large. e.g. [1024, 36000] will OOM
 NUM_BLOCKS = [1024, 10000]
@@ -432,3 +440,257 @@ def test_fp8_e4m3_conversion(
     ops.convert_fp8(converted_cache, cache_fp8)
 
     torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1)
+
+
+def _create_mla_cache(
+    num_blocks: int,
+    block_size: int,
+    entry_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    device: str,
+    align_cache: bool,
+) -> torch.Tensor:
+    cache_dtype = torch.uint8 if kv_cache_dtype == "fp8" else dtype
+
+    if align_cache:
+        alloc_entry_size = align_to_256bytes(entry_size, cache_dtype)
+        alloc_shape = (num_blocks, block_size, alloc_entry_size)
+        cache_full = torch.zeros(alloc_shape, dtype=cache_dtype, device=device)
+        cache = cache_full[..., :entry_size]
+    else:
+        cache = torch.zeros(num_blocks,
+                            block_size,
+                            entry_size,
+                            dtype=cache_dtype,
+                            device=device)
+    return cache
+
+
+def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str):
+    rand_dtype = torch.float16 if kv_cache_dtype == "fp8" else cache.dtype
+
+    vals = torch.randn(*cache.shape, device=cache.device, dtype=rand_dtype)
+    if kv_cache_dtype == "fp8":
+        temp = torch.zeros_like(cache)
+        ops.convert_fp8(temp, vals, 1.0, kv_dtype=kv_cache_dtype)
+        vals = temp
+    cache.copy_(vals)
+
+
+@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
+@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@pytest.mark.parametrize("align_cache", [False])
+@torch.inference_mode()
+def test_concat_and_cache_mla(
+    kv_lora_rank: int,
+    qk_rope_head_dim: int,
+    num_tokens: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    kv_cache_dtype: str,
+    align_cache: bool,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+
+    total_slots = num_blocks * block_size
+    slot_mapping_lst = random.sample(range(total_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst,
+                                dtype=torch.long,
+                                device=device)
+
+    kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
+    k_pe = torch.randn(num_tokens,
+                       qk_rope_head_dim,
+                       dtype=dtype,
+                       device=device)
+    entry_size = kv_lora_rank + qk_rope_head_dim
+
+    scale = torch.tensor(0.1, dtype=torch.float32, device=device)
+    kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
+                                 kv_cache_dtype, device, align_cache)
+    ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
+
+    for i in range(num_tokens):
+        slot = slot_mapping[i].item()
+        block_idx = slot // block_size
+        block_offset = slot % block_size
+        ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[i]
+        ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[i]
+
+    if kv_cache_dtype == "fp8":
+        ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
+        ops.convert_fp8(ref_kv_cache,
+                        ref_temp,
+                        scale.item(),
+                        kv_dtype=kv_cache_dtype)
+    else:
+        ref_kv_cache = ref_temp
+
+    opcheck(
+        torch.ops._C_cache_ops.concat_and_cache_mla,
+        (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+    )
+
+    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
+                             kv_cache_dtype, scale)
+
+    if kv_cache_dtype == "fp8":
+        result_temp = torch.empty_like(kv_cache, dtype=torch.float16)
+        ops.convert_fp8(result_temp,
+                        kv_cache.contiguous(),
+                        scale.item(),
+                        kv_dtype=kv_cache_dtype)
+        expected_temp = torch.empty_like(ref_kv_cache, dtype=torch.float16)
+        ops.convert_fp8(expected_temp,
+                        ref_kv_cache,
+                        scale.item(),
+                        kv_dtype=kv_cache_dtype)
+        torch.testing.assert_close(result_temp,
+                                   expected_temp,
+                                   atol=0.001,
+                                   rtol=0.1)
+    else:
+        torch.testing.assert_close(kv_cache, ref_kv_cache)
+
+
+@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
+@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
+@pytest.mark.parametrize("num_layers", NUM_LAYERS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@pytest.mark.parametrize("align_cache", [False, True])
+@torch.inference_mode()
+def test_copy_blocks_mla(
+    kv_lora_rank: int,
+    qk_rope_head_dim: int,
+    block_size: int,
+    num_blocks: int,
+    num_layers: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    kv_cache_dtype: str,
+    align_cache: bool,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+
+    entry_size = kv_lora_rank + qk_rope_head_dim
+
+    kv_caches = []
+    for _ in range(num_layers):
+        kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
+                                     kv_cache_dtype, device, align_cache)
+        _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
+        kv_caches.append(kv_cache)
+
+    ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
+
+    num_mappings = min(2, num_blocks // 2)
+    src_blocks = random.sample(range(num_blocks), num_mappings)
+    remaining = list(set(range(num_blocks)) - set(src_blocks))
+    dst_blocks = random.sample(remaining, 2 * num_mappings)
+    block_mapping = []
+    for i in range(num_mappings):
+        src = src_blocks[i]
+        dst1 = dst_blocks[2 * i]
+        dst2 = dst_blocks[2 * i + 1]
+        block_mapping.append((src, dst1))
+        block_mapping.append((src, dst2))
+    block_mapping_tensor = torch.tensor(block_mapping,
+                                        dtype=torch.int64,
+                                        device=device).view(-1, 2)
+
+    for src, dst in block_mapping:
+        for ref_cache in ref_caches:
+            ref_cache[dst].copy_(ref_cache[src])
+
+    opcheck(
+        torch.ops._C_cache_ops.copy_blocks_mla,
+        (kv_caches, block_mapping_tensor),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+    )
+    ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
+
+    for kv_cache, ref_cache in zip(kv_caches, ref_caches):
+        torch.testing.assert_close(kv_cache, ref_cache)
+
+
+@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
+@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@pytest.mark.parametrize("align_cache", [False, True])
+@torch.inference_mode()
+def test_swap_blocks_mla(
+    kv_lora_rank: int,
+    qk_rope_head_dim: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    kv_cache_dtype: str,
+    align_cache: bool,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+
+    entry_size = kv_lora_rank + qk_rope_head_dim
+
+    src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
+                                  kv_cache_dtype, device, align_cache)
+    dst_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
+                                  kv_cache_dtype, device, align_cache)
+
+    _fill_mla_cache(src_cache, kv_cache_dtype)
+    _fill_mla_cache(dst_cache, kv_cache_dtype)
+
+    src_cache_clone = src_cache.clone()
+
+    num_mappings = min(2, num_blocks // 2)
+    src_blocks = random.sample(range(num_blocks), num_mappings)
+    remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
+    dst_blocks = random.sample(remaining_blocks, num_mappings)
+    block_mapping = list(zip(src_blocks, dst_blocks))
+    block_mapping_tensor = torch.tensor(block_mapping,
+                                        dtype=torch.int64,
+                                        device="cpu").view(-1, 2)
+
+    opcheck(
+        torch.ops._C_cache_ops.swap_blocks,
+        (src_cache, dst_cache, block_mapping_tensor),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+        cond=(kv_lora_rank == KV_LORA_RANKS[0]
+              and qk_rope_head_dim == QK_ROPE_HEAD_DIMS[0]),
+    )
+
+    ops.swap_blocks(src_cache, dst_cache, block_mapping_tensor)
+
+    for src, dst in block_mapping:
+        torch.testing.assert_close(
+            src_cache_clone[src].cpu(),
+            dst_cache[dst].cpu(),
+            msg=f"Block {src} from src should have been swapped to block "
+            f"{dst} in dst_cache.")
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index bdc9a6a33..a68235016 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1037,6 +1037,11 @@ def copy_blocks(key_caches: List[torch.Tensor],
     torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
 
 
+def copy_blocks_mla(kv_caches: List[torch.Tensor],
+                    block_mapping: torch.Tensor) -> None:
+    torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
+
+
 def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                 block_mapping: torch.Tensor) -> None:
     torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
index 20d7ef0fa..9a1984a93 100644
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
@@ -26,7 +26,6 @@ from vllm.attention.backends.mla.utils import MLACommonImpl, MLACommonMetadata
 from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
                                            compute_slot_mapping_start_idx,
                                            is_block_tables_empty)
-from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
@@ -72,14 +71,14 @@ class TritonMLABackend(AttentionBackend):
         dst_kv_cache: torch.Tensor,
         src_to_dst: torch.Tensor,
     ) -> None:
-        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
 
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
         src_to_dists: torch.Tensor,
     ) -> None:
-        PagedAttention.copy_blocks(kv_caches, src_to_dists)
+        ops.copy_blocks_mla(kv_caches, src_to_dists)
 
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py
index ec5ec4ce6..057fccb5e 100644
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
@@ -204,10 +204,10 @@ def _decode_att_m_fwd(
         Req_to_tokens.stride(0),
         q.stride(0),
         q.stride(1),
-        k_buffer.stride(-2),
-        k_buffer.stride(-1),
-        v_buffer.stride(-2),
-        v_buffer.stride(-1),
+        k_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
+        k_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
+        v_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
+        v_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
         att_out.stride(0),
         att_out.stride(1),
         att_out.stride(2),
@@ -438,10 +438,10 @@ def _decode_grouped_att_m_fwd(
         Req_to_tokens.stride(0),
         q.stride(0),
         q.stride(1),
-        k_buffer.stride(-2),
-        k_buffer.stride(-1),
-        v_buffer.stride(-2),
-        v_buffer.stride(-1),
+        k_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
+        k_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
+        v_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
+        v_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
         att_out.stride(0),
         att_out.stride(1),
         att_out.stride(2),
diff --git a/vllm/envs.py b/vllm/envs.py
index 5018f6deb..2c731eda7 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -82,6 +82,7 @@ if TYPE_CHECKING:
     VLLM_MLA_DISABLE: bool = False
     VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
     VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
+    VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
     VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
 
 
@@ -539,6 +540,15 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
     lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
                  ),
+
+    # When on a Nvidia GPU aligns single entries (within a page) so they are 256
+    # byte aligned for better performance, this increases the memory usage of
+    # the cache. Currently this only affects MLA that results in non-256
+    # byte aligned entries. This matches the alignment the CUDA runtime uses
+    # for all allocations. Currently this primarily affects MLA, for most other
+    # models the alignment is already naturally aligned to 256 bytes.
+    "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
+    lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
 }
 
 # end-env-vars-definition
diff --git a/vllm/utils.py b/vllm/utils.py
index a2b53fcf2..8b9269598 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -563,6 +563,10 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
+def round_up(x: int, y: int) -> int:
+    return ((x + y - 1) // y) * y
+
+
 def _generate_random_fp8(
     tensor: torch.Tensor,
     low: float,
@@ -794,6 +798,12 @@ def get_dtype_size(dtype: torch.dtype) -> int:
     return torch.tensor([], dtype=dtype).element_size()
 
 
+def align_to_256bytes(extent: int, dtype: torch.dtype) -> int:
+    dtype_size = get_dtype_size(dtype)
+    eles_per_256bytes = 256 // dtype_size
+    return round_up(extent, eles_per_256bytes)
+
+
 # `collections` helpers
 def is_list_of(
     value: object,
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 252fe0660..3960392cf 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -2,13 +2,17 @@
 """CacheEngine class for managing the KV cache."""
 from typing import List
 
+import numpy as np
 import torch
 
+from vllm import envs
 from vllm.attention import get_attn_backend
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType,
-                        get_dtype_size, is_pin_memory_available)
+                        align_to_256bytes, get_dtype_size,
+                        is_pin_memory_available)
 
 logger = init_logger(__name__)
 
@@ -38,6 +42,7 @@ class CacheEngine:
         self.num_attention_layers = model_config.get_num_layers_by_block_type(
             parallel_config, LayerBlockType.attention)
         self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
+        self.align_cache = self._align_cache(model_config)
 
         self.block_size = cache_config.block_size
         self.num_gpu_blocks = cache_config.num_gpu_blocks
@@ -75,15 +80,39 @@ class CacheEngine:
             num_blocks, self.block_size, self.num_kv_heads, self.head_size)
         pin_memory = is_pin_memory_available() if device == "cpu" else False
         kv_cache: List[torch.Tensor] = []
+
+        # Align entries so they are 256 byte aligned for better performance
+        # Primarily targets MLA as this typically only ends up having entries
+        # be 128 byte aligned.
+        if self.align_cache:
+            # We assume the cache shape is:
+            #    (TOTAL_PAGES, PAGE_SIZE, entry_shape...)
+            # NOTE this assumption currently only holds for MLA so we only apply
+            # this optimization when `use_mla` is true
+            entry_shape = kv_cache_shape[2:]
+            entry_size = np.prod(entry_shape)
+            alloc_entry_size = align_to_256bytes(entry_size, self.dtype)
+            alloc_shape = (*kv_cache_shape[:2], alloc_entry_size)
+        else:
+            alloc_shape = kv_cache_shape
+
         for _ in range(self.num_attention_layers):
             # null block in CpuGpuBlockAllocator requires at least that
             # block to be zeroed-out.
             # We zero-out everything for simplicity.
-            kv_cache.append(
-                torch.zeros(kv_cache_shape,
-                            dtype=self.dtype,
-                            pin_memory=pin_memory,
-                            device=device))
+            layer_kv_cache = torch.zeros(alloc_shape,
+                                         dtype=self.dtype,
+                                         pin_memory=pin_memory,
+                                         device=device)
+
+            # If we allocated with padding for alignment reasons truncate the
+            # shape while preserving the aligned stride
+            if self.align_cache:
+                layer_kv_cache = layer_kv_cache[..., :entry_size]
+
+            # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases
+            # when entry_shape is higher than 1D
+            kv_cache.append(layer_kv_cache.view(kv_cache_shape))
         return kv_cache
 
     def swap_in(self, src_to_dst: torch.Tensor) -> None:
@@ -99,6 +128,14 @@ class CacheEngine:
     def copy(self, src_to_dsts: torch.Tensor) -> None:
         self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts)
 
+    @staticmethod
+    def _align_cache(model_config: ModelConfig):
+        # Currently align_cache only applies to MLA models since the other
+        # cache kernels haven't been updated yet to support non-continguous
+        # tensors
+        return model_config.use_mla and current_platform.is_cuda() \
+            and envs.VLLM_CUDA_MEM_ALIGN_KV_CACHE
+
     @staticmethod
     def get_cache_block_size(
         cache_config: CacheConfig,
@@ -110,14 +147,21 @@ class CacheEngine:
         num_attention_layers = model_config.get_num_layers_by_block_type(
             parallel_config, LayerBlockType.attention)
 
-        key_cache_block = cache_config.block_size * num_heads * head_size
-        # For MLA there is no value cache, since the latent vector
-        # is joint keys and values.
-        value_cache_block = key_cache_block if not model_config.use_mla else 0
-        total = num_attention_layers * (key_cache_block + value_cache_block)
         if cache_config.cache_dtype == "auto":
             dtype = model_config.dtype
         else:
             dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
+
+        key_cache_entry = num_heads * head_size
+        if CacheEngine._align_cache(model_config):
+            key_cache_entry = align_to_256bytes(key_cache_entry,
+                                                model_config.dtype)
+
+        # For MLA there is no value cache, since the latent vector
+        # is joint keys and values.
+        value_cache_entry = key_cache_entry if not model_config.use_mla else 0
+        total = num_attention_layers * cache_config.block_size * \
+            (key_cache_entry + value_cache_entry)
+
         dtype_size = get_dtype_size(dtype)
         return dtype_size * total
-- 
GitLab


From b3a0d01e4551118c735ee905c4ddc800ec603f24 Mon Sep 17 00:00:00 2001
From: Aviv Keshet <akeshet@gmail.com>
Date: Tue, 4 Feb 2025 18:46:26 -0800
Subject: [PATCH 39/65] [Core] add and implement
 `VLLM_LOGITS_PROCESSOR_THREADS` (#12368)

Signed-off-by: Aviv Keshet <akeshet@scaledcognition.com>
---
 vllm/envs.py                                  |  9 ++++
 .../model_executor/layers/logits_processor.py | 46 ++++++++++++++-----
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 2c731eda7..bb419dacb 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     VLLM_LOGGING_LEVEL: str = "INFO"
     VLLM_LOGGING_PREFIX: str = ""
     VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+    VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
     VLLM_TRACE_FUNCTION: int = 0
     VLLM_ATTENTION_BACKEND: Optional[str] = None
     VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
@@ -282,6 +283,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_LOGGING_PREFIX":
     lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
 
+    # if set, vllm will call logits processors in a thread pool with this many
+    # threads. This is useful when using custom logits processors that either
+    # (a) launch additional CUDA kernels or (b) do significant CPU-bound work
+    # while not holding the python GIL, or both.
+    "VLLM_LOGITS_PROCESSOR_THREADS":
+    lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
+    if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,
+
     # Trace function calls
     # If set to 1, vllm will trace function calls
     # Useful for debugging
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index ebf74c67d..cdc67ca83 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """A layer that compute logits from hidden_stats."""
 import inspect
+from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
 import torch
@@ -15,6 +16,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.platforms import current_platform
 
+_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None
+if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None:
+    _logits_processor_threadpool = ThreadPoolExecutor(
+        envs.VLLM_LOGITS_PROCESSOR_THREADS)
+
 
 class LogitsProcessor(nn.Module):
     """Process logits and apply logits processors from sampling metadata.
@@ -135,6 +141,7 @@ def _apply_logits_processors(
 ) -> torch.Tensor:
     found_logits_processors = False
     logits_processed = 0
+    logits_row_ids_and_logits_row_futures = []
     for seq_group in sampling_metadata.seq_groups:
         seq_ids = seq_group.seq_ids
         sampling_params = seq_group.sampling_params
@@ -148,22 +155,39 @@ def _apply_logits_processors(
                 past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids
                 prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids
 
-                for logits_processor in logits_processors:
-                    parameters = inspect.signature(logits_processor).parameters
-                    if len(parameters) == 3:
-                        logits_row = logits_processor(prompt_tokens_ids,
-                                                      past_tokens_ids,
-                                                      logits_row)
-                    else:
-                        logits_row = logits_processor(past_tokens_ids,
-                                                      logits_row)
-
-                logits[logits_row_idx] = logits_row
+                if _logits_processor_threadpool is not None:
+                    logits_row_ids_and_logits_row_futures.append(
+                        (logits_row_idx,
+                         _logits_processor_threadpool.submit(
+                             _apply_logits_processors_single_seq, logits_row,
+                             logits_processors, past_tokens_ids,
+                             prompt_tokens_ids)))
+                else:
+                    logits[logits_row_idx] = \
+                        _apply_logits_processors_single_seq(
+                            logits_row, logits_processors, past_tokens_ids,
+                            prompt_tokens_ids)
 
         logits_processed += len(seq_group.sample_indices) + len(
             seq_group.prompt_logprob_indices)
 
+    for logits_row_idx, future in logits_row_ids_and_logits_row_futures:
+        logits[logits_row_idx] = future.result()
+
     if found_logits_processors:
         # verifies that no rows in logits were missed unexpectedly
         assert logits_processed == logits.shape[0]
     return logits
+
+
+def _apply_logits_processors_single_seq(logits_row, logits_processors,
+                                        past_tokens_ids,
+                                        prompt_tokens_ids) -> torch.Tensor:
+    for logits_processor in logits_processors:
+        parameters = inspect.signature(logits_processor).parameters
+        if len(parameters) == 3:
+            logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids,
+                                          logits_row)
+        else:
+            logits_row = logits_processor(past_tokens_ids, logits_row)
+    return logits_row
-- 
GitLab


From 64862d106efa78032702f5fa5c110ccd6d654e9a Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Tue, 4 Feb 2025 19:58:22 -0800
Subject: [PATCH 40/65] [ROCM][AMD][TRITON] Halving warps number for fw_prefill
 to reduce spilling (#12713)

Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
---
 vllm/attention/ops/prefix_prefill.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index fbb6757ee..5fca16393 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -11,7 +11,7 @@ from vllm.platforms import current_platform
 
 # Static kernels parameters
 BASE_BLOCK = 128 if current_platform.has_device_capability(80) else 64
-NUM_WARPS = 8
+NUM_WARPS = 4 if current_platform.is_rocm() else 8
 
 # To check compatibility
 IS_TURING = current_platform.get_device_capability() == (7, 5)
-- 
GitLab


From 249824c3bfef6b9c03dd087569ef1e1072b2a4b0 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 5 Feb 2025 04:31:12 +0000
Subject: [PATCH 41/65] Refactor `Linear` handling in `TransformersModel`
 (#12727)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/layers/linear.py       | 30 ++++-----
 vllm/model_executor/models/transformers.py | 76 ++++++++++------------
 2 files changed, 48 insertions(+), 58 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 08f1e103e..da8db08fe 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -2,7 +2,7 @@
 
 import itertools
 from abc import abstractmethod
-from typing import Dict, List, Optional, Tuple
+from typing import Optional
 
 import torch
 import torch.nn.functional as F
@@ -47,8 +47,8 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
 
 
 def adjust_bitsandbytes_4bit_shard(param: Parameter,
-                                   shard_offsets: Dict[str, Tuple[int, int]],
-                                   loaded_shard_id: str) -> Tuple[int, int]:
+                                   shard_offsets: dict[str, tuple[int, int]],
+                                   loaded_shard_id: str) -> tuple[int, int]:
     """Adjust the quantization offsets and sizes for BitsAndBytes sharding."""
 
     total, _ = shard_offsets["total"]
@@ -90,7 +90,7 @@ class LinearMethodBase(QuantizeMethodBase):
     @abstractmethod
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
-                       output_partition_sizes: List[int], input_size: int,
+                       output_partition_sizes: list[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
         """Create weights for a linear layer. 
@@ -123,7 +123,7 @@ class UnquantizedLinearMethod(LinearMethodBase):
 
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
-                       output_partition_sizes: List[int], input_size: int,
+                       output_partition_sizes: list[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
         weight = Parameter(torch.empty(sum(output_partition_sizes),
@@ -179,7 +179,8 @@ class LinearBase(torch.nn.Module):
             self.quant_method = quant_config.get_quant_method(self,
                                                               prefix=prefix)
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self,
+                x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]:
         raise NotImplementedError
 
 
@@ -240,9 +241,8 @@ class ReplicatedLinear(LinearBase):
         assert param.size() == loaded_weight.size()
         param.data.copy_(loaded_weight)
 
-    def forward(
-        self, x: torch.Tensor
-    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    def forward(self,
+                x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]:
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
         output = self.quant_method.apply(self, x, bias)
@@ -288,7 +288,7 @@ class ColumnParallelLinear(LinearBase):
                  skip_bias_add: bool = False,
                  params_dtype: Optional[torch.dtype] = None,
                  quant_config: Optional[QuantizationConfig] = None,
-                 output_sizes: Optional[List[int]] = None,
+                 output_sizes: Optional[list[int]] = None,
                  prefix: str = ""):
         super().__init__(input_size, output_size, skip_bias_add, params_dtype,
                          quant_config, prefix)
@@ -374,7 +374,7 @@ class ColumnParallelLinear(LinearBase):
             loaded_weight = loaded_weight.reshape(1)
         param.load_column_parallel_weight(loaded_weight=loaded_weight)
 
-    def forward(self, input_):
+    def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]:
         bias = self.bias if not self.skip_bias_add else None
 
         # Matrix multiply.
@@ -422,7 +422,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
 
     def __init__(self,
                  input_size: int,
-                 output_sizes: List[int],
+                 output_sizes: list[int],
                  bias: bool = True,
                  gather_output: bool = False,
                  skip_bias_add: bool = False,
@@ -500,7 +500,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             current_shard_offset = 0
             use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
                                             False)
-            shard_offsets: List[Tuple[int, int, int]] = []
+            shard_offsets: list[tuple[int, int, int]] = []
             for i, output_size in enumerate(self.output_sizes):
                 shard_offsets.append((i, current_shard_offset, output_size))
                 current_shard_offset += output_size
@@ -602,7 +602,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         """
 
         current_shard_offset = 0
-        shard_offsets: List[Tuple[int, int, int]] = []
+        shard_offsets: list[tuple[int, int, int]] = []
         for i, output_size in enumerate(self.output_sizes):
             shard_offsets.append((i, current_shard_offset, output_size))
             current_shard_offset += output_size
@@ -1124,7 +1124,7 @@ class RowParallelLinear(LinearBase):
 
         param.load_row_parallel_weight(loaded_weight=loaded_weight)
 
-    def forward(self, input_):
+    def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]:
         if self.input_is_parallel:
             input_parallel = input_
         else:
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index 160beaa14..dfc714382 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2024 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,7 +15,7 @@
 # limitations under the License.
 """Wrapper around `transformers` models"""
 import re
-from typing import Iterable, List, Optional, Set, Tuple, Union
+from typing import Iterable, Optional, Union
 
 import torch
 from torch import nn
@@ -71,23 +72,10 @@ def vllm_flash_attention_forward(
 ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward
 
 
-# Linear Layer that is compatible with transformers internal forward
-# TODO: This is a temporary solution, we should find a better way to integrate
-class HFColumnParallelLinear(ColumnParallelLinear):
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return super().forward(input)[0]
-
-
-class HFRowParallelLinear(RowParallelLinear):
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return super().forward(input)[0]
-
-
-def replace_tp_linear_class(orig_module: nn.Linear,
-                            style: str,
-                            quant_config=None):
+def replace_linear_class(
+        linear: nn.Linear,
+        style: str,
+        quant_config=None) -> Union[ColumnParallelLinear, RowParallelLinear]:
     """
     In model configurations, we use a neutral type (string) to specify parallel
     styles, here we use it to translate nn.Linear into vllm-style tp Linear.
@@ -99,26 +87,28 @@ def replace_tp_linear_class(orig_module: nn.Linear,
         raise ValueError(
             f"Unsupported parallel style type {type(style)}, expected str")
 
-    input_size = orig_module.in_features
-    output_size = orig_module.out_features
-    bias = orig_module.bias is not None
+    vllm_linear_cls = {
+        "colwise": ColumnParallelLinear,
+        "rowwise": RowParallelLinear,
+    }.get(style)
 
-    if style == "colwise":
-        return HFColumnParallelLinear(
-            input_size,
-            output_size,
-            bias,
-        )
-    elif style == "rowwise":
-        return HFRowParallelLinear(
-            input_size,
-            output_size,
-            bias,
-        )
-    # We don't consider colwise_rep since it's used in lm_head
-    else:
+    if vllm_linear_cls is None:
         raise ValueError(f"Unsupported parallel style value: {style}")
 
+    class HFCompatibleLinear(vllm_linear_cls):
+        """
+        Wrapper class that removes `output_bias` from returned output.
+        """
+
+        def forward(self, input: torch.Tensor) -> torch.Tensor:
+            return super().forward(input)[0]
+
+    return HFCompatibleLinear(
+        input_size=linear.in_features,
+        output_size=linear.out_features,
+        bias=linear.bias is not None,
+    )
+
 
 class TransformersModel(nn.Module):
     embedding_padding_modules = ["lm_head"]
@@ -192,16 +182,16 @@ class TransformersModel(nn.Module):
                 "support it yet!")
 
         for child_name, child_module in module.named_children():
-            qual_name = prefix + child_name
+            qual_name = maybe_prefix(prefix, child_name)
             for pattern, style in self.config.base_model_tp_plan.items():
                 if re.match(pattern, qual_name) and isinstance(
                         child_module, nn.Linear):
-                    new_module = replace_tp_linear_class(
-                        child_module, style, self.quant_config)
+                    new_module = replace_linear_class(child_module, style,
+                                                      self.quant_config)
                     setattr(module, child_name, new_module)
                     self.log_replacement(qual_name, child_module, new_module)
             else:
-                self.tensor_parallelize(child_module, prefix=f"{qual_name}.")
+                self.tensor_parallelize(child_module, prefix=qual_name)
 
     def replace_vocab_embed_class(self, module: nn.Module):
         # Use native set input embeddings
@@ -219,7 +209,7 @@ class TransformersModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],  # argument not used
+        kv_caches: list[torch.Tensor],  # argument not used
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
@@ -249,10 +239,10 @@ class TransformersModel(nn.Module):
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self, weights: Iterable[Tuple[str,
-                                                   torch.Tensor]]) -> Set[str]:
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
-        loaded_params: Set[str] = set()
+        loaded_params = set[str]()
         for name, loaded_weight in weights:
             if name not in params_dict:
                 name = f"{self.model.base_model_prefix}.{name}"
-- 
GitLab


From 98fd089fc974313ac13370f79f4c02a3839baf4a Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 5 Feb 2025 12:44:26 +0800
Subject: [PATCH 42/65] [VLM] Add MLA with pure RoPE support for deepseek-vl2
 models (#12729)

---
 vllm/attention/backends/mla/utils.py      | 30 ++++++++++++++++++++---
 vllm/model_executor/models/deepseek_v2.py |  3 ++-
 vllm/model_executor/models/deepseek_v3.py |  3 ++-
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py
index 8e584cca3..cd8c08e5a 100644
--- a/vllm/attention/backends/mla/utils.py
+++ b/vllm/attention/backends/mla/utils.py
@@ -26,7 +26,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     apply_fp8_linear_generic, current_platform_fp8_dtype, is_fp8)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     scaled_dequantize, scaled_quantize)
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.model_executor.layers.rotary_embedding import (
+    DeepseekScalingRotaryEmbedding, RotaryEmbedding)
 
 try:
     from vllm.vllm_flash_attn import flash_attn_varlen_func
@@ -174,6 +175,8 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         self.v_head_dim = v_head_dim
 
         self.rotary_emb = rotary_emb
+        self.use_yarn_rope = isinstance(rotary_emb,
+                                        DeepseekScalingRotaryEmbedding)
         self.q_proj = q_proj
         self.kv_b_proj = kv_b_proj
         self.o_proj = o_proj
@@ -420,6 +423,24 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
     ) -> torch.Tensor:
         raise NotImplementedError
 
+    def apply_pure_rope(
+        self,
+        input_positions: torch.Tensor,
+        q_pe: torch.Tensor,
+        k_pe: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        seq_len = input_positions.size(0)
+        ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape
+
+        q_pe, k_pe = self.rotary_emb(
+            input_positions,
+            q_pe.reshape(seq_len, -1),
+            k_pe.reshape(seq_len, -1),
+        )
+        q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape)
+
+        return q_pe, k_pe
+
     def forward(
         self,
         layer: AttentionLayer,
@@ -444,13 +465,14 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         # Restore head dim (for rotary embedding)
         k_pe = k_pe.unsqueeze(1)
         assert hasattr(attn_metadata, "input_positions")
+        rope_fn = (self.rotary_emb
+                   if self.use_yarn_rope else self.apply_pure_rope)
 
         if is_decode:
             q_nope = self._q_proj_and_k_up_proj(hidden_states_or_q_c)
             q_pe = torch.matmul(hidden_states_or_q_c, self.W_QR)\
                 .view(-1, self.num_heads, self.qk_rope_head_dim)
-            q_pe, k_pe = \
-                self.rotary_emb(attn_metadata.input_positions, q_pe, k_pe)
+            q_pe, k_pe = rope_fn(attn_metadata.input_positions, q_pe, k_pe)
         else:
             assert is_prefill
             q = self.q_proj(hidden_states_or_q_c)[0]\
@@ -458,7 +480,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
 
             # TODO(lucas): there must be a nicer way to write this line
             q[..., self.qk_nope_head_dim:], k_pe = \
-                self.rotary_emb(
+                rope_fn(
                     attn_metadata.input_positions,
                     q[..., self.qk_nope_head_dim:], k_pe)
 
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index f5fede4d8..fdd584f9d 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -414,7 +414,8 @@ class DeepseekV2MLAAttention(nn.Module):
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.o_proj")
 
-        rope_scaling["rope_type"] = 'deepseek_yarn'
+        if rope_scaling:
+            rope_scaling["rope_type"] = 'deepseek_yarn'
         self.rotary_emb = get_rope(qk_rope_head_dim,
                                    rotary_dim=qk_rope_head_dim,
                                    max_position=max_position_embeddings,
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index a4829aa1a..81f82b182 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -422,7 +422,8 @@ class DeepseekV3MLAAttention(nn.Module):
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.o_proj")
 
-        rope_scaling["rope_type"] = 'deepseek_yarn'
+        if rope_scaling:
+            rope_scaling["rope_type"] = 'deepseek_yarn'
         self.rotary_emb = get_rope(qk_rope_head_dim,
                                    rotary_dim=qk_rope_head_dim,
                                    max_position=max_position_embeddings,
-- 
GitLab


From 686006a22020c80fcaab2d12064302505188f577 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Tue, 4 Feb 2025 23:44:48 -0500
Subject: [PATCH 43/65] [Misc] Bump the compressed-tensors version (#12736)

---
 requirements-common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 97e33a6db..cfa020256 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -34,6 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.0 # required for compressed-tensors
+compressed-tensors == 0.9.1 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
-- 
GitLab


From 7ff7a638b66fdeca71257f245369bd7337e55d32 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Wed, 5 Feb 2025 00:32:06 -0500
Subject: [PATCH 44/65] [Model][Quant] Fix GLM, Fix fused module mappings for
 quantization (#12634)

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
---
 .../layers/quantization/base_config.py        |   3 +-
 .../compressed_tensors/compressed_tensors.py  |  37 +++--
 .../quantization/compressed_tensors/utils.py  | 140 +++++++-----------
 .../layers/quantization/quark/quark.py        |  10 +-
 .../layers/quantization/quark/utils.py        |  17 ++-
 .../layers/quantization/utils/quant_utils.py  |  26 ++--
 vllm/model_executor/model_loader/loader.py    |   4 +
 vllm/model_executor/model_loader/utils.py     |  22 +++
 vllm/model_executor/models/chatglm.py         |  26 +++-
 .../models/glm4_vision_encoder.py             |  31 ++--
 vllm/model_executor/models/minicpmv.py        |  14 +-
 vllm/model_executor/models/qwen.py            |  16 +-
 12 files changed, 195 insertions(+), 151 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 2eefcc4f3..c0d8553c0 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -2,7 +2,7 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Mapping, Optional, Type
 
 import torch
 from torch import nn
@@ -59,6 +59,7 @@ def method_has_implemented_embedding(
 
 class QuantizationConfig(ABC):
     """Base class for quantization configs."""
+    packed_modules_mapping: Mapping[str, List[str]] = dict()
 
     @abstractmethod
     def get_name(self) -> str:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 1a11b2419..0e3258e4a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -83,7 +83,9 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         # Check if the layer is skipped for quantization.
         # TODO (@robertgshaw2): support module names
-        if should_ignore_layer(prefix, ignore=self.ignore):
+        if should_ignore_layer(prefix,
+                               ignore=self.ignore,
+                               fused_mapping=self.packed_modules_mapping):
             return UnquantizedLinearMethod()
         if isinstance(layer, LinearBase):
             scheme = self.get_scheme(layer=layer, layer_name=prefix)
@@ -379,34 +381,29 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         # Will be empty for models with only sparsity
         weight_quant = input_quant = None
-        sparsity_scheme: Optional[SparsityCompressionConfig] = None
         if self.target_scheme_map:
             matched_target = find_matched_target(
                 layer_name=layer_name,
                 module=layer,
-                targets=self.target_scheme_map.keys())
+                targets=self.target_scheme_map.keys(),
+                fused_mapping=self.packed_modules_mapping)
 
             scheme_dict = self.target_scheme_map[matched_target]
             weight_quant = scheme_dict.get("weights")
             input_quant = scheme_dict.get("input_activations")
 
-        if self.sparsity_scheme_map:
-            is_ignored = False
-            with suppress(ValueError):
-                is_ignored = find_matched_target(
-                    layer_name=layer_name,
-                    module=layer,
-                    targets=self.sparsity_ignore_list)
-
-            # if the layer is in the sparsity ignore list,
-            # we should not apply any sparsity scheme
-
-            if not is_ignored:
-                matched_target = find_matched_target(
-                    layer_name=layer_name,
-                    module=layer,
-                    targets=self.sparsity_scheme_map.keys())
-                sparsity_scheme = self.sparsity_scheme_map.get(matched_target)
+        # Find the sparsity scheme of the layer
+        # assume that fused layers inerhit first component's sparsity scheme
+        sparsity_targets = (self.sparsity_scheme_map.keys() -
+                            set(self.sparsity_ignore_list))
+        sparsity_scheme: Optional[SparsityCompressionConfig] = None
+        with suppress(ValueError):
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=sparsity_targets,
+                fused_mapping=self.packed_modules_mapping)
+            sparsity_scheme = self.sparsity_scheme_map[matched_target]
 
         if self.supports_cutlass_24(weight_quant=weight_quant,
                                     input_quant=input_quant,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 4ea79531e..85ae1d5cb 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import re
-from typing import Iterable, Optional
+from types import MappingProxyType
+from typing import Iterable, List, Mapping, Optional
 
 from compressed_tensors import CompressionFormat
 from torch.nn import Module
 
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    FUSED_LAYER_NAME_MAPPING)
-
 
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
@@ -19,8 +17,11 @@ def is_activation_quantization_format(format: str) -> bool:
     return format in _ACTIVATION_QUANTIZATION_FORMATS
 
 
-def should_ignore_layer(layer_name: Optional[str],
-                        ignore: Iterable[str]) -> bool:
+def should_ignore_layer(
+    layer_name: Optional[str],
+    ignore: Iterable[str] = tuple(),
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+) -> bool:
     if layer_name is None:
         return False
 
@@ -32,8 +33,8 @@ def should_ignore_layer(layer_name: Optional[str],
     # in the safetensors checkpoint. So, we convert the name
     # from the fused version to unfused + check to make sure that
     # each shard of the fused layer has the same scheme.
-    if proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore:
-        shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
+    if proj_name in fused_mapping and layer_name not in ignore:
+        shard_proj_names = fused_mapping[proj_name]
 
         # Convert fused_name --> [shard_names]
         shard_names = [
@@ -79,55 +80,12 @@ def check_equal_or_regex_match(layer_name: str,
     return False
 
 
-def _handle_fused_layers(func):
-    """
-    Decorator to handle fused layers by mapping vllm fused layer names
-    to their corresponding unfused layer names for quantization/pruning schemes.
-    """
-    # fused_layer_name -> unfused_layer_name
-    fused_layer_map = {
-        "qkv_proj": "q_proj",
-        "gate_up_proj": "up_proj",
-    }
-
-    def fused_layer_handler(layer_name: Optional[str], module: Module,
-                            targets: Iterable[str]) -> Optional[str]:
-        """
-        Wrapper function specifically designed to support the
-        find_matched_target function.
-
-        It handles cases where the provided layer name corresponds to a
-        fused layer in vllm, mapping it to its equivalent unfused layer name
-        based on the predefined fused_layer_map. If the original layer name
-        raises a ValueError in the wrapped function, this handler
-        will attempt to resolve the issue by substituting with unfused
-        layer name.
-
-        :param layer_name: Name of the layer, which may be fused.
-        :param module: An instance of torch.nn.Module.
-        :param targets: A list of target names or patterns to match.
-        :return: The result of the wrapped find_matched_target function with
-            the resolved layer name.
-        :raises ValueError: If the layer name cannot be resolved to a 
-            valid target.
-        """
-        try:
-            return func(layer_name, module, targets)
-        except ValueError:
-            if layer_name is None:
-                layer_name = ""
-            parent_name, fused_proj_name = layer_name.rsplit(".", 1)
-            unfused_proj_name = fused_layer_map.get(fused_proj_name,
-                                                    fused_proj_name)
-            new_layer_name = f"{parent_name}.{unfused_proj_name}"
-            return func(new_layer_name, module, targets)
-
-    return fused_layer_handler
-
-
-@_handle_fused_layers
-def find_matched_target(layer_name: Optional[str], module: Module,
-                        targets: Iterable[str]) -> str:
+def find_matched_target(
+    layer_name: Optional[str],
+    module: Module,
+    targets: Iterable[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+) -> str:
     """
     Helper function to look up which "target" in the compressed-tensors
     config that a layer corresponds to.
@@ -141,19 +99,25 @@ def find_matched_target(layer_name: Optional[str], module: Module,
 
     First, we try to match the layer_name with a target
     Second, we try to match the module's name with a target
+    Third, we try to map the layer_name to a list of fused module names.
+        *All* component module names must match in order for a match to be
+        successful. A successful match returns the first component target
 
     :param layer_name: layer name
     :param module: torch.nn.Module
     :param targets: list of targets to match the layer against
+    :param fused_mapping: map from fused layer names to its components
+    :param fused_strategy: either "all" or "any". If using "all", fused
+        layers match if "all" of its components match
     """
 
     if layer_name is None:
         layer_name = ""
 
-    matched_target = (_find_first_match(layer_name, targets)
-                      or _find_first_match(module.__class__.__name__, targets,
-                                           True)
-                      or _match_fused_layer(layer_name, targets))
+    matched_target = (
+        _find_first_match(layer_name, targets)
+        or _find_first_match(module.__class__.__name__, targets, True)
+        or _match_fused_layer(layer_name, targets, fused_mapping))
 
     if matched_target is None:
         raise ValueError(
@@ -205,11 +169,19 @@ def _is_equal_or_regex_match(value: str,
     return False
 
 
-def _match_fused_layer(layer_name: str,
-                       target_layers: Iterable[str]) -> Optional[str]:
+def _match_fused_layer(
+        layer_name: str, target_layers: Iterable[str],
+        fused_mapping: Mapping[str, List[str]]) -> Optional[str]:
     """
     Match a fused layer name to its corresponding individual layer in 
-    target_layers.
+    target_layers. Returns first value in fused_mapping which matches targets
+
+    Implements an "all" matching strategy where a fused layer matches iff
+    "all" of its components match
+
+    :param layer_name: layer name
+    :param target_layers: list of targets to match the layer against
+    :param fused_mapping: map from fused layer names to its components
 
     Examples:
         layer_name = "model.layers.0.self_attn.qkv_proj"
@@ -217,27 +189,25 @@ def _match_fused_layer(layer_name: str,
                         "model.layers.0.self_attn.k_proj",
                         "model.layers.0.self_attn.v_proj"]
     """
-    # Split into parent path and layer type
-    # e.g., "model.layers.0.self_attn" and "qkv_proj"
-    parent_path = ".".join(layer_name.split(".")[:-1])
-    layer_type = layer_name.split(".")[-1]
-
-    if layer_type not in FUSED_LAYER_NAME_MAPPING:
+    # find layer_name in mapping
+    fused = next((key for key in fused_mapping if layer_name.endswith(key)),
+                 None)
+    if fused is None:
         return None
 
-    possible_layer_types = FUSED_LAYER_NAME_MAPPING[layer_type]
-
-    # Look for a target layer that:
-    # 1. Has the same parent path
-    # 2. Ends with one of the possible individual layer types
-    for target in target_layers:
-        is_same_parent = parent_path in target
-        is_matching_type = any(type_suffix in target
-                               for type_suffix in possible_layer_types)
-
-        if is_same_parent and is_matching_type and all(
-            (f"{parent_path}.{type_suffix}" in target_layers)
-                for type_suffix in possible_layer_types):
-            return target
+    # expand path of unfused components
+    unfused_paths = [
+        layer_name.replace(fused, unfused) for unfused in fused_mapping[fused]
+    ]
 
-    return None
+    # for each unfused component, find a match in targets
+    unfused_matches: List[Optional[str]] = []
+    for unfused in unfused_paths:
+        for target in target_layers:
+            if _is_equal_or_regex_match(unfused, target):
+                unfused_matches.append(target)
+                break
+        else:
+            unfused_matches.append(None)
+
+    return unfused_matches[0] if all(unfused_matches) else None
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 0451cf82b..ba123565a 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -18,8 +18,6 @@ from vllm.model_executor.layers.quantization.quark.schemes import (
     QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8)
 from vllm.model_executor.layers.quantization.quark.utils import (
     deep_compare, should_ignore_layer)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    FUSED_LAYER_NAME_MAPPING)
 from vllm.platforms import current_platform
 
 __all__ = ["QuarkLinearMethod"]
@@ -58,7 +56,9 @@ class QuarkConfig(QuantizationConfig):
 
         # Check if the layer is skipped for quantization.
         exclude_layers = cast(List[str], self.quant_config.get("exclude"))
-        if should_ignore_layer(prefix, ignore=exclude_layers):
+        if should_ignore_layer(prefix,
+                               ignore=exclude_layers,
+                               fused_mapping=self.packed_modules_mapping):
             return UnquantizedLinearMethod()
         if isinstance(layer, LinearBase):
             scheme = self.get_scheme(layer=layer, layer_name=prefix)
@@ -201,8 +201,8 @@ class QuarkConfig(QuantizationConfig):
                              module: torch.nn.Module) -> Dict[str, Any]:
 
         proj_name = layer_name.split(".")[-1]
-        if proj_name in FUSED_LAYER_NAME_MAPPING:
-            shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
+        if proj_name in self.packed_modules_mapping:
+            shard_proj_names = self.packed_modules_mapping[proj_name]
 
             # Convert fused_name --> [shard_names]
             shard_names = [
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
index afb1d9d63..17e0df021 100644
--- a/vllm/model_executor/layers/quantization/quark/utils.py
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import re
-from typing import Any, Iterable, Optional
-
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    FUSED_LAYER_NAME_MAPPING)
+from types import MappingProxyType
+from typing import Any, Iterable, List, Mapping, Optional
 
 
 def deep_compare(dict1: Any, dict2: Any) -> bool:
@@ -20,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool:
         return dict1 == dict2
 
 
-def should_ignore_layer(layer_name: Optional[str],
-                        ignore: Iterable[str]) -> bool:
+def should_ignore_layer(
+    layer_name: Optional[str],
+    ignore: Iterable[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+) -> bool:
     if layer_name is None:
         return False
 
@@ -33,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str],
     # in the safetensors checkpoint. So, we convert the name
     # from the fused version to unfused + check to make sure that
     # each shard of the fused layer has the same scheme.
-    if proj_name in FUSED_LAYER_NAME_MAPPING:
-        shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
+    if proj_name in fused_mapping:
+        shard_proj_names = fused_mapping[proj_name]
 
         # Convert fused_name --> [shard_names]
         shard_names = [
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 62484f62f..c7ce3a42c 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """This file is used for /tests and /benchmarks"""
-from typing import List, Optional, Tuple
+from types import MappingProxyType
+from typing import List, Mapping, Optional, Tuple
 
 import numpy
 import torch
@@ -12,14 +13,6 @@ from vllm.scalar_type import ScalarType, scalar_types
 SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
 SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
 
-# Note: this is a hack. We should update each model to register the
-# stacked params and get it from there instead in a future PR.
-# fused_name: List[shard_name]
-FUSED_LAYER_NAME_MAPPING = {
-    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    "gate_up_proj": ["gate_proj", "up_proj"]
-}
-
 
 # Normalize the group_shape to the full extent for any dims that are -1
 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int,
@@ -178,14 +171,23 @@ def unpack_quantized_values_into_int32(w_q: torch.Tensor,
     return res.permute(inv_perm)
 
 
-def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool:
+def is_layer_skipped(
+    prefix: str,
+    ignored_layers: List[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+) -> bool:
     # prefix: model.layers.0.self_attn.q_proj
     # proj_name: q_proj
     proj_name = prefix.split(".")[-1]
-    if proj_name in FUSED_LAYER_NAME_MAPPING:
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping:
         shard_prefixes = [
             prefix.replace(proj_name, shard_proj_name)
-            for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name]
+            for shard_proj_name in fused_mapping[proj_name]
         ]
 
         is_skipped = None
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 19e3bc6a2..2a2c2523b 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -43,6 +43,7 @@ from vllm.model_executor.model_loader.tensorizer import (
     TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
     serialize_vllm_model, tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (ParamMapping,
+                                                    configure_quant_config,
                                                     get_model_architecture,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -113,6 +114,9 @@ def _initialize_model(
     model_config = vllm_config.model_config
     model_class, _ = get_model_architecture(model_config)
 
+    if vllm_config.quant_config is not None:
+        configure_quant_config(vllm_config.quant_config, model_class)
+
     signatures = inspect.signature(model_class.__init__)
     all_params = [param.name for param in signatures.parameters.values()]
     if "vllm_config" in all_params and "prefix" in all_params:
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 7a82a695c..dc620d498 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -11,6 +11,8 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module
 
 from vllm.config import ModelConfig, ModelImpl
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.adapters import (as_classification_model,
                                                  as_embedding_model,
@@ -138,3 +140,23 @@ class ParamMapping:
             if module_name.endswith(key):
                 return key, value
         return None
+
+
+def configure_quant_config(quant_config: QuantizationConfig,
+                           model_class: Type[nn.Module]):
+    """
+    Pass packed_modules_mapping by reference to quant_config so that
+    quant_config can properly match fused modules
+
+    Note that model attributes are passed by reference to quant_config,
+    enabling them to be updated by model_class.__new__ (ex. chatglm, qwen)
+    """
+    packed_mapping = getattr(model_class, "packed_modules_mapping", None)
+    if packed_mapping is not None:
+        # pass packed_modules_mapping by reference to quant_config
+        quant_config.packed_modules_mapping = packed_mapping
+    else:
+        logger.warning(
+            "The model class %s has not defined `packed_modules_mapping`, "
+            "this may lead to incorrect mapping of quantized or ignored "
+            "modules", model_class.__name__)
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index b81a9e917..a31648675 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -265,12 +265,14 @@ class GLMAttention(nn.Module):
             self.total_num_kv_heads,
             bias=config.add_bias_linear or config.add_qkv_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             config.hidden_size,
             bias=config.add_bias_linear,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
 
         # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
@@ -327,6 +329,7 @@ class GLMMLP(nn.Module):
         self,
         config: ChatGLMConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
 
@@ -338,6 +341,7 @@ class GLMMLP(nn.Module):
             [config.ffn_hidden_size] * 2,
             bias=config.add_bias_linear,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
         )
 
         self.activation_func = SiluAndMul()
@@ -348,6 +352,7 @@ class GLMMLP(nn.Module):
             config.hidden_size,
             bias=config.add_bias_linear,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
         )
 
     def forward(self, hidden_states):
@@ -396,7 +401,7 @@ class GLMBlock(nn.Module):
             config.hidden_size, eps=config.layernorm_epsilon)
 
         # MLP
-        self.mlp = GLMMLP(config, quant_config)
+        self.mlp = GLMMLP(config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -507,7 +512,8 @@ class ChatGLMModel(nn.Module):
 
         self.embedding = VocabParallelEmbedding(config.padded_vocab_size,
                                                 config.hidden_size,
-                                                quant_config=quant_config)
+                                                quant_config=quant_config,
+                                                prefix=f"{prefix}.embedding")
 
         self.num_layers = config.num_layers
         self.multi_query_group_num = config.multi_query_group_num
@@ -766,6 +772,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
                          SupportsMultiModal):
     # Ensure that the LoRA support check passes when the class is not
     # initialized, but set all these attributes to empty.
+    # These will be updated when an instance class is selected
     packed_modules_mapping = {}
     supported_lora_modules = []
     embedding_modules = {}
@@ -777,9 +784,18 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
         prefix: str = "",
     ) -> None:
         config = vllm_config.model_config.hf_config
+
         # Initialize VL
-        if hasattr(config, "vision_config"):
-            return ChatGLMV(vllm_config=vllm_config, prefix=prefix)
+        if hasattr(config, "vision_config"):  # noqa: SIM108
+            instance_cls = ChatGLMV
         # Initialize LLM
         else:
-            return ChatGLM(vllm_config=vllm_config, prefix=prefix)
\ No newline at end of file
+            instance_cls = ChatGLM
+
+        # quant_config references base class members,
+        # so update values before init is called
+        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
+        cls.supported_lora_modules += instance_cls.supported_lora_modules
+        cls.embedding_modules.update(instance_cls.embedding_modules)
+        cls.embedding_padding_modules += instance_cls.embedding_padding_modules
+        return instance_cls(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py
index 4449eb8e8..2facd1353 100644
--- a/vllm/model_executor/models/glm4_vision_encoder.py
+++ b/vllm/model_executor/models/glm4_vision_encoder.py
@@ -74,11 +74,13 @@ class Attention(nn.Module):
             self.head_dim,
             config.num_heads,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             config.hidden_size,
             config.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
 
         self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim,
@@ -101,6 +103,7 @@ class MLP(nn.Module):
         self,
         config,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = '',
     ):
         super().__init__()
         self.config = config
@@ -109,11 +112,13 @@ class MLP(nn.Module):
             config.hidden_size,
             config.intermediate_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
         )
         self.fc2 = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -137,7 +142,9 @@ class TransformerLayer(nn.Module):
         self.attention = Attention(config,
                                    quant_config=quant_config,
                                    prefix=f"{prefix}.attention")
-        self.mlp = MLP(config, quant_config=quant_config)
+        self.mlp = MLP(config,
+                       quant_config=quant_config,
+                       prefix=f"{prefix}.mlp")
         self.post_attention_layernorm = LayerNorm(config.hidden_size,
                                                   eps=config.layer_norm_eps)
 
@@ -164,7 +171,7 @@ class Transformer(nn.Module):
         self.layers = nn.ModuleList([
             TransformerLayer(config,
                              quant_config=quant_config,
-                             prefix=f"{prefix}.layer.{layer_idx}")
+                             prefix=f"{prefix}.layers.{layer_idx}")
             for layer_idx in range(config.num_hidden_layers)
         ])
 
@@ -181,6 +188,7 @@ class GLU(nn.Module):
         config,
         in_features,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = '',
     ):
         """
         The original implementation is the same as:
@@ -222,7 +230,8 @@ class GLU(nn.Module):
         self.linear_proj = ReplicatedLinear(in_features,
                                             config.hidden_size,
                                             bias=False,
-                                            quant_config=quant_config)
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.linear_proj")
         self.norm1 = nn.LayerNorm(config.hidden_size)
         self.act1 = nn.GELU()
         self.act2 = SiluAndMul()
@@ -230,12 +239,15 @@ class GLU(nn.Module):
         self.merged_proj = MergedColumnParallelLinear(
             config.hidden_size, [config.ffn_hidden_size] * 2,
             bias=False,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.merged_proj")
 
-        self.dense_4h_to_h = RowParallelLinear(config.ffn_hidden_size,
-                                               config.hidden_size,
-                                               bias=False,
-                                               quant_config=quant_config)
+        self.dense_4h_to_h = RowParallelLinear(
+            config.ffn_hidden_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h")
 
     def forward(self, x):
         x, _ = self.linear_proj(x)
@@ -262,7 +274,8 @@ class EVA2CLIPModel(nn.Module):
                                        prefix=f"{prefix}.transformer")
         self.linear_proj = GLU(config,
                                in_features=config.hidden_size,
-                               quant_config=quant_config)
+                               quant_config=quant_config,
+                               prefix=f"{prefix}.linear_proj")
         self.conv = nn.Conv2d(in_channels=vision_config.hidden_size,
                               out_channels=config.hidden_size,
                               kernel_size=2,
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 3d16d635b..20f3a3d19 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1473,6 +1473,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
     """
     # Ensure that the LoRA support check passes when the class is not
     # initialized, but set all these attributes to empty.
+    # These will be updated when an instance class is selected
     packed_modules_mapping = {}
     supported_lora_modules = []
     embedding_modules = {}
@@ -1489,8 +1490,15 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
             version = str(config.version).split(".")
             version = tuple([int(x) for x in version])
         # Dispatch class based on version
-        instance_class = _SUPPORT_VERSION.get(version)
-        if instance_class is None:
+        instance_cls = _SUPPORT_VERSION.get(version)
+        if instance_cls is None:
             raise ValueError(
                 "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6")
-        return instance_class(vllm_config=vllm_config, prefix=prefix)
+
+        # quant_config references base class members,
+        # so update values before init is called
+        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
+        cls.supported_lora_modules += instance_cls.supported_lora_modules
+        cls.embedding_modules.update(instance_cls.embedding_modules)
+        cls.embedding_padding_modules += instance_cls.embedding_padding_modules
+        return instance_cls(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 327fad0f5..897066124 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1135,6 +1135,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA):
     """
     # Ensure that the LoRA support check passes when the class is not
     # initialized, but set all these attributes to empty.
+    # These will be updated when an instance class is selected
     packed_modules_mapping = {}
     supported_lora_modules = []
     embedding_modules = {}
@@ -1146,9 +1147,18 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA):
         prefix: str = "",
     ) -> QWenBaseModel:
         config = vllm_config.model_config.hf_config
+
         # Initialize VL
-        if hasattr(config, "visual"):
-            return QWenVL(vllm_config=vllm_config, prefix=prefix)
+        if hasattr(config, "visual"):  # noqa: SIM108
+            instance_cls = QWenVL
         # Initialize LLM
         else:
-            return QWenLLM(vllm_config=vllm_config, prefix=prefix)
+            instance_cls = QWenLLM
+
+        # quant_config references base class members,
+        # so update values before init is called
+        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
+        cls.supported_lora_modules += instance_cls.supported_lora_modules
+        cls.embedding_modules.update(instance_cls.embedding_modules)
+        cls.embedding_padding_modules += instance_cls.embedding_padding_modules
+        return instance_cls(vllm_config=vllm_config, prefix=prefix)
-- 
GitLab


From 58b218d7ae91340a70a2a961d03f6e49315c2cfa Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 5 Feb 2025 01:42:09 -0500
Subject: [PATCH 45/65] [Doc] Update PR Reminder with link to Developer Slack
 (#12748)

---
 .github/workflows/reminder_comment.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index df62539c0..27318c2fd 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -2,7 +2,6 @@ name: PR Reminder Comment Bot
 on:
   pull_request_target:
     types: [opened]
-
 jobs:
   pr_reminder:
     runs-on: ubuntu-latest
@@ -15,7 +14,12 @@ jobs:
               owner: context.repo.owner,
               repo: context.repo.repo,
               issue_number: context.issue.number,
-              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
+                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+                '🚀'
             })
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-- 
GitLab


From fcf2e3d7fcc9898b7a1b26bacea22753ab76f3a6 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 5 Feb 2025 06:42:46 +0000
Subject: [PATCH 46/65] [Bugfix] Fix OpenVINO model runner (#12750)

---
 vllm/attention/backends/openvino.py          |  4 ++++
 vllm/model_executor/model_loader/openvino.py | 11 +++++------
 vllm/worker/openvino_model_runner.py         |  9 +++------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
index f58528dbf..9908620a3 100644
--- a/vllm/attention/backends/openvino.py
+++ b/vllm/attention/backends/openvino.py
@@ -140,3 +140,7 @@ class OpenVINOAttentionMetadata:
     # `model_executable`.
     multi_modal_placeholder_index_maps: Optional[Dict[
         str, MultiModalPlaceholderMap.IndexMap]]
+
+    # Enable/disable KV scales calculation. This is so that we can disable the
+    # calculation until after prefill and cuda graph capture.
+    enable_kv_scales_calculation: bool
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
index 7bd531c56..fde200d57 100644
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -13,7 +13,7 @@ from torch import nn
 
 import vllm.envs as envs
 from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
-from vllm.config import DeviceConfig, ModelConfig
+from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
                                                          _prune_hidden_states)
@@ -103,7 +103,6 @@ class OpenVINOCausalLM(nn.Module):
         self,
         ov_core: ov.Core,
         model_config: ModelConfig,
-        device_config: DeviceConfig,
         kv_cache_dtype: ov.Type,
     ) -> None:
         super().__init__()
@@ -187,8 +186,7 @@ class OpenVINOCausalLM(nn.Module):
 
 
 def get_model(
-    model_config: ModelConfig,
-    device_config: DeviceConfig,
+    vllm_config: VllmConfig,
     kv_cache_dtype: ov.Type,
     **kwargs,
 ) -> torch.nn.Module:
@@ -201,5 +199,6 @@ def get_model(
             "be added in the future. If this is important to you, "
             "please open an issue on github.")
 
-    return OpenVINOCausalLM(ov_core, model_config, device_config,
-                            kv_cache_dtype)
+    with set_current_vllm_config(vllm_config):
+        return OpenVINOCausalLM(ov_core, vllm_config.model_config,
+                                kv_cache_dtype)
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 44442cddb..f7a5ab9de 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -54,15 +54,13 @@ class OpenVINOModelRunner(ModelRunnerBase):
     ):
         self.ov_core = ov_core
         ModelRunnerBase.__init__(self, vllm_config=vllm_config)
-        cache_config = self.cache_config
-        model_config = self.model_config
         self.is_driver_worker = is_driver_worker
 
         self.device = self.device_config.device
 
         self.kv_cache_dtype = kv_cache_dtype
-        self.sliding_window = model_config.get_sliding_window()
-        self.block_size = cache_config.block_size
+        self.sliding_window = self.model_config.get_sliding_window()
+        self.block_size = self.cache_config.block_size
 
         self.attn_backend = get_attn_backend(
             self.model_config.get_head_size(),
@@ -81,8 +79,7 @@ class OpenVINOModelRunner(ModelRunnerBase):
         self.model: nn.Module  # Set after init_Model
 
     def load_model(self) -> None:
-        self.model = get_model(model_config=self.model_config,
-                               device_config=self.device_config,
+        self.model = get_model(vllm_config=self.vllm_config,
                                kv_cache_dtype=self.kv_cache_dtype,
                                ov_core=self.ov_core)
 
-- 
GitLab


From 3d09e592a860982acef0edef858078d28d393e84 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Tue, 4 Feb 2025 22:43:02 -0800
Subject: [PATCH 47/65] [V1][Misc] Shorten `FinishReason` enum and use constant
 strings (#12760)

---
 vllm/v1/engine/__init__.py    | 12 +++++++++---
 vllm/v1/engine/detokenizer.py |  7 +++----
 vllm/v1/metrics/loggers.py    |  6 +++---
 vllm/v1/metrics/stats.py      |  7 +++----
 vllm/v1/request.py            | 14 +++++++-------
 5 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 6bd548bdc..d5933cac5 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -14,11 +14,17 @@ if TYPE_CHECKING:
     from vllm.multimodal.inputs import PlaceholderRange
     from vllm.sampling_params import SamplingParams
 
+# These are possible values of RequestOutput.finish_reason,
+# so form part of the external API.
+FINISH_REASON_STRINGS = ("stop", "length", "abort")
 
-class RequestFinishedReason(enum.IntEnum):
+
+class FinishReason(enum.IntEnum):
     """
     Reason a request finished - stop, length, or abort.
 
+    Int rather than Str for more compact serialization.
+
     stop - a stop string was emitted
     length - max_tokens was consumed, or max_model_len was reached
     abort - aborted for another reason
@@ -29,7 +35,7 @@ class RequestFinishedReason(enum.IntEnum):
     ABORT = 2
 
     def __str__(self):
-        return self.name.lower()
+        return FINISH_REASON_STRINGS[self.value]
 
 
 @dataclass
@@ -62,7 +68,7 @@ class EngineCoreOutput(
     request_id: str
     new_token_ids: List[int]
     finished: bool
-    finish_reason: Optional[RequestFinishedReason] = None
+    finish_reason: Optional[FinishReason] = None
     stop_reason: Union[int, str, None] = None
 
 
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 2bce23e68..861fcb012 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -8,8 +8,7 @@ from vllm.logger import init_logger
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
-                            RequestFinishedReason)
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
 
 logger = init_logger(__name__)
 
@@ -19,7 +18,7 @@ class DetokenizerOutput:
     output_text: str
     token_ids: List[int]
     finished: bool
-    finish_reason: Optional[RequestFinishedReason] = None
+    finish_reason: Optional[FinishReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -148,7 +147,7 @@ class IncrementalDetokenizer:
                 stop_str, truncate_to = stop
                 if truncate_to != -1:
                     self.output_text = self.output_text[:truncate_to]
-                finish_reason = RequestFinishedReason.STOP
+                finish_reason = FinishReason.STOP
                 stop_reason = stop_str
 
         # TODO: handle stop_token_ids here too?
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index b62351a8f..eb1acf584 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -9,7 +9,7 @@ import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
-from vllm.v1.engine import RequestFinishedReason
+from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -117,13 +117,13 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
-        self.counter_request_success: Dict[RequestFinishedReason,
+        self.counter_request_success: Dict[FinishReason,
                                            prometheus_client.Counter] = {}
         counter_request_success_base = prometheus_client.Counter(
             name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
             labelnames=labelnames + ["finished_reason"])
-        for reason in RequestFinishedReason:
+        for reason in FinishReason:
             self.counter_request_success[
                 reason] = counter_request_success_base.labels(*(labelvalues +
                                                                 [str(reason)]))
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 36c95e07d..e3f1efcc9 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
+    from vllm.v1.engine import EngineCoreOutput, FinishReason
 
 
 @dataclass
@@ -32,7 +32,7 @@ class RequestStateStats:
 class FinishedRequestStats:
     """Stats associated with a finished request."""
 
-    finish_reason: "RequestFinishedReason"
+    finish_reason: "FinishReason"
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
 
@@ -74,8 +74,7 @@ class IterationStats:
         request_state_stats.num_generation_tokens += num_new_generation_tokens
         request_state_stats.last_token_time = now
 
-    def update_from_finished_request(self,
-                                     finish_reason: "RequestFinishedReason",
+    def update_from_finished_request(self, finish_reason: "FinishReason",
                                      request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
         self.finished_requests.append(
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index eb9bf99b4..89b39ea61 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
+from vllm.v1.engine import EngineCoreRequest, FinishReason
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
@@ -109,7 +109,7 @@ class Request:
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
+    def get_finished_reason(self) -> Union[FinishReason, None]:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
@@ -150,7 +150,7 @@ class RequestStatus(enum.IntEnum):
 
     @staticmethod
     def get_finished_reason(
-            status: "RequestStatus") -> Union[RequestFinishedReason, None]:
+            status: "RequestStatus") -> Union[FinishReason, None]:
         return _FINISHED_REASON_MAP.get(status)
 
 
@@ -159,8 +159,8 @@ class RequestStatus(enum.IntEnum):
 # are longer than the model's length cap. Therefore, the stop
 # reason should also be "length" as in OpenAI API.
 _FINISHED_REASON_MAP = {
-    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
-    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
-    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
-    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
+    RequestStatus.FINISHED_STOPPED: FinishReason.STOP,
+    RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH,
+    RequestStatus.FINISHED_ABORTED: FinishReason.ABORT,
+    RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH,
 }
-- 
GitLab


From c53dc466b1b802d45d0b61cce36908334ea7a23e Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 5 Feb 2025 01:43:11 -0500
Subject: [PATCH 48/65] [Doc] Remove performance warning for auto_awq.md
 (#12743)

---
 docs/source/features/quantization/auto_awq.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
index 30735b116..fa0bebeb8 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -2,12 +2,6 @@
 
 # AutoAWQ
 
-:::{warning}
-Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
-accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
-inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version.
-:::
-
 To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
 Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
 The main benefits are lower latency and memory usage.
-- 
GitLab


From 022bcc701a948f96e68af678eee686837f393d07 Mon Sep 17 00:00:00 2001
From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com>
Date: Wed, 5 Feb 2025 12:41:02 +0530
Subject: [PATCH 49/65] [Bugfix] Fix 'ModuleNotFoundError: No module named
 'intel_extension_for_pytorch'' for --tensor-parallel-size more than 1 
 (#12546)

---
 vllm/distributed/parallel_state.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index c5c5dfbba..321902d11 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -329,9 +329,17 @@ class GroupCoordinator:
             return input_
 
         if input_.is_cpu:
-            import intel_extension_for_pytorch as ipex
-            ipex.distributed.all_reduce(input_, group=self.device_group)
-            return input_
+            try:
+                import intel_extension_for_pytorch as ipex
+                ipex.distributed.all_reduce(input_, group=self.device_group)
+                return input_
+            except ImportError:
+                """
+                Intel IPEX not found. Falling back to PyTorch native 
+                all_reduce for CPU
+                """
+                torch.distributed.all_reduce(input_, group=self.device_group)
+                return input_
 
         if self.tpu_communicator is not None and \
             not self.tpu_communicator.disabled:
-- 
GitLab


From bc1bdecebf76cca0dfafe4924d529b30c8a24795 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 6 Feb 2025 02:03:19 +0800
Subject: [PATCH 50/65] [core][distributed] exact ray placement control
 (#12732)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml               |   2 +
 examples/offline_inference/ray_placement.py | 121 ++++++++++++++++++++
 vllm/envs.py                                |  14 +++
 vllm/executor/ray_distributed_executor.py   |  36 +++---
 vllm/platforms/cuda.py                      |   8 ++
 vllm/platforms/interface.py                 |   5 +
 6 files changed, 173 insertions(+), 13 deletions(-)
 create mode 100644 examples/offline_inference/ray_placement.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a847a68a6..7ef40564c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -128,6 +128,7 @@ steps:
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
   - examples/offline_inference/rlhf.py
+  - examples/offline_inference/ray_placement.py
   commands:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
@@ -136,6 +137,7 @@ steps:
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - python3 ../examples/offline_inference/rlhf.py
+  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
 
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2 
diff --git a/examples/offline_inference/ray_placement.py b/examples/offline_inference/ray_placement.py
new file mode 100644
index 000000000..cd801a3c0
--- /dev/null
+++ b/examples/offline_inference/ray_placement.py
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+a simple demonstration to show how to control
+the placement of the vLLM workers with Ray.
+The key is to set VLLM_RAY_PER_WORKER_GPUS and
+VLLM_RAY_BUNDLE_INDICES properly.
+"""
+import os
+
+import ray
+from ray.util.placement_group import placement_group
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+from vllm import LLM
+from vllm.worker.worker import Worker
+
+
+class MyWorker(Worker):
+
+    def report_device_id(self) -> str:
+        from vllm.platforms import current_platform
+        return current_platform.get_device_uuid(self.device.index)
+
+
+class MyLLM(LLM):
+
+    def __init__(self, *args, bundle_indices: list, **kwargs):
+        # a hack to make the script work.
+        # stop ray from manipulating CUDA_VISIBLE_DEVICES
+        # at the top-level
+        del os.environ["CUDA_VISIBLE_DEVICES"]
+        # every worker will use 0.4 GPU, so that we can schedule
+        # 2 instances on the same GPUs.
+        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
+        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(
+            map(str, bundle_indices))
+        print(f"creating LLM with bundle_indices={bundle_indices}")
+        super().__init__(*args, **kwargs)
+
+
+class RayTrainingActor:
+
+    def report_device_id(self) -> str:
+        # the argument for get_device_uuid is the index
+        # of the GPU in the visible devices.
+        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
+        from vllm.platforms import current_platform
+        return current_platform.get_device_uuid(0)
+
+
+# ray manages 4 GPUs
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+ray.init()
+
+# we want to co-locate vLLM instance and the training actor
+# on the same set of GPUs.
+# the placement plan is as follows:
+# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
+# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
+
+pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
+ray.get(pg.ready())
+print(f"placement group has bundles {pg.bundle_specs=}")
+
+training_actors = []
+training_actor_device_ids = []
+inference_engines = []
+inference_engine_device_ids = []
+
+for bundle_index in [0, 1, 2, 3]:
+    training_actor = ray.remote(
+        num_cpus=0,
+        num_gpus=0.4,
+        scheduling_strategy=PlacementGroupSchedulingStrategy(
+            placement_group=pg,
+            placement_group_capture_child_tasks=True,
+            placement_group_bundle_index=bundle_index,
+        ),
+    )(RayTrainingActor).remote()
+    training_actors.append(training_actor)
+    device_id = ray.get(training_actor.report_device_id.remote())
+    print(f"training actor {bundle_index} is on {device_id}")
+    training_actor_device_ids.append(device_id)
+
+for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
+    # IMPORTANT: when creating vLLM instances, we need to
+    # make sure there are no GPU activities on the target GPUs,
+    # otherwise, they will interfere with the vLLM memory profiling,
+    # and cause unexpected behaviors.
+    llm = ray.remote(
+        num_cpus=0,
+        num_gpus=0,
+        scheduling_strategy=PlacementGroupSchedulingStrategy(
+            placement_group=pg,
+            placement_group_capture_child_tasks=True,
+        ),
+    )(MyLLM).remote(
+        model="facebook/opt-125m",
+        enforce_eager=True,
+        worker_cls=MyWorker,
+        tensor_parallel_size=2,
+        distributed_executor_backend="ray",
+        gpu_memory_utilization=0.4,
+        bundle_indices=bundle_indices,
+    )
+    inference_engines.append(llm)
+    # don't call any method on the inference engine here,
+    # otherwise it will block until the vLLM instance is created.
+
+for i, llm in enumerate(inference_engines):
+    inference_engine_device_ids.append(
+        ray.get(llm.collective_rpc.remote("report_device_id", args=tuple())))
+    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
+
+# check the placement
+# the first two training actors should be
+# on the same GPUs as the first inference engine
+assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
+# the last two training actors should be
+# on the same GPUs as the second inference engine
+assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
diff --git a/vllm/envs.py b/vllm/envs.py
index bb419dacb..745b068b7 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -85,6 +85,8 @@ if TYPE_CHECKING:
     VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
     VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
     VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
+    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
+    VLLM_RAY_BUNDLE_INDICES: str = ""
 
 
 def get_default_cache_root():
@@ -550,6 +552,18 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
                  ),
 
+    # Number of GPUs per worker in Ray, if it is set to be a fraction,
+    # it allows ray to schedule multiple actors on a single GPU,
+    # so that users can colocate other actors on the same GPUs as vLLM.
+    "VLLM_RAY_PER_WORKER_GPUS":
+    lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),
+
+    # Bundle indices for Ray, if it is set, it can control precisely
+    # which indices are used for the Ray bundle, for every worker.
+    # Format: comma-separated list of integers, e.g. "0,1,2,3"
+    "VLLM_RAY_BUNDLE_INDICES":
+    lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
+
     # When on a Nvidia GPU aligns single entries (within a page) so they are 256
     # byte aligned for better performance, this increases the memory usage of
     # the cache. Currently this only affects MLA that results in non-256
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 80e7a1c40..6a25a4d50 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -129,13 +129,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
 
     def _init_workers_ray(self, placement_group: "PlacementGroup",
                           **ray_remote_kwargs):
-        if (self.parallel_config.tensor_parallel_size == 1
-                and self.parallel_config.pipeline_parallel_size == 1):
-            # For single GPU case, we use a ray worker with constrained memory.
-            num_gpus = self.cache_config.gpu_memory_utilization
-        else:
-            # Otherwise, the ray workers are allocated with a full GPU.
-            num_gpus = 1
+        num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS
 
         # The driver dummy worker does not actually use any resources.
         # It holds the resource for the driver worker.
@@ -155,12 +149,29 @@ class RayDistributedExecutor(DistributedExecutorBase):
         logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
 
         # Create the workers.
-        driver_ip = get_ip()
-        rank = 0
+        bundle_indices: List[int]
+        if envs.VLLM_RAY_BUNDLE_INDICES:
+            # Use the bundle indices specified by the user.
+            bundle_indices = list(
+                map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(",")))
+            assert len(bundle_indices) == self.parallel_config.world_size, \
+            ("VLLM_RAY_BUNDLE_INDICES must have the same size"
+            f" as the world size, but got {bundle_indices=} "
+            f"and {self.parallel_config.world_size=}")
+            assert len(set(bundle_indices)) == len(bundle_indices), \
+            ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values,"
+            f" but got {bundle_indices=}")
+        else:
+            # use the first N bundles that have GPU resources.
+            bundle_indices = []
+            for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+                if bundle.get(current_platform.ray_device_key, 0):
+                    bundle_indices.append(bundle_id)
+            bundle_indices = bundle_indices[:self.parallel_config.world_size]
+
         worker_metadata: List[RayWorkerMetaData] = []
-        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get(current_platform.ray_device_key, 0):
-                continue
+        driver_ip = get_ip()
+        for rank, bundle_id in enumerate(bundle_indices):
             scheduling_strategy = PlacementGroupSchedulingStrategy(
                 placement_group=placement_group,
                 placement_group_capture_child_tasks=True,
@@ -187,7 +198,6 @@ class RayDistributedExecutor(DistributedExecutorBase):
                                            rpc_rank=rank)
             worker_metadata.append(
                 RayWorkerMetaData(worker=worker, created_rank=rank))
-            rank += 1
 
         worker_ips = ray.get([
             each.worker.get_node_ip.remote()  # type: ignore[attr-defined]
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index b49852a72..991d55ac8 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -275,6 +275,14 @@ class NvmlCudaPlatform(CudaPlatformBase):
         physical_device_id = device_id_to_physical_device_id(device_id)
         return cls._get_physical_device_name(physical_device_id)
 
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+        return pynvml.nvmlDeviceGetUUID(handle)
+
     @classmethod
     @lru_cache(maxsize=8)
     @with_nvml_context
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index dc6545c93..211e288b1 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -183,6 +183,11 @@ class Platform:
         """Get the name of a device."""
         raise NotImplementedError
 
+    @classmethod
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        """Get the uuid of a device, e.g. the PCI bus ID."""
+        raise NotImplementedError
+
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         """Get the total memory of a device in bytes."""
-- 
GitLab


From 4c3aac51e14214880a3205b45c26aa16535992b5 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Thu, 6 Feb 2025 05:24:26 +0800
Subject: [PATCH 51/65] Merging PR #12536

Merged via CLI script
---
 vllm/attention/layer.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 19ee89630..e4df7ffc5 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -156,9 +156,13 @@ class Attention(nn.Module):
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        if self.calculate_kv_scales and \
-            attn_metadata.enable_kv_scales_calculation:
-            self.calc_kv_scales(key, value)
+        # NOTE: please avoid accessing `kv_cache` and `attn_metadata` arguments
+        # directly, use `self.kv_cache` and
+        # `get_forward_context().attn_metadata` instead.
+        if self.calculate_kv_scales:
+            ctx_attn_metadata = get_forward_context().attn_metadata
+            if ctx_attn_metadata.enable_kv_scales_calculation:
+                self.calc_kv_scales(key, value)
         if self.use_output:
             output = torch.empty_like(query)
             hidden_size = query.size(-1)
@@ -172,15 +176,27 @@ class Attention(nn.Module):
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
             if self.use_direct_call:
-                unified_attention_with_output(query, key, value, output,
-                                              self.layer_name)
+                forward_context: ForwardContext = get_forward_context()
+                ctx_attn_metadata = forward_context.attn_metadata
+                self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+                self.impl.forward(self,
+                                  query,
+                                  key,
+                                  value,
+                                  self_kv_cache,
+                                  ctx_attn_metadata,
+                                  output=output)
             else:
                 torch.ops.vllm.unified_attention_with_output(
                     query, key, value, output, self.layer_name)
             return output.view(-1, hidden_size)
         else:
             if self.use_direct_call:
-                return unified_attention(query, key, value, self.layer_name)
+                forward_context = get_forward_context()
+                ctx_attn_metadata = forward_context.attn_metadata
+                self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+                return self.impl.forward(self, query, key, value,
+                                         self_kv_cache, ctx_attn_metadata)
             else:
                 return torch.ops.vllm.unified_attention(
                     query, key, value, self.layer_name)
-- 
GitLab


From af8486de49a200a980f71fddc6d1eb4d8f9f1bca Mon Sep 17 00:00:00 2001
From: Sanju C Sudhakaran <scsudhakaran@habana.ai>
Date: Thu, 6 Feb 2025 02:59:45 +0530
Subject: [PATCH 52/65] [Hardware][Intel-Gaudi] Enable FusedSDPA support for
 Intel Gaudi (HPU)

---
 vllm/attention/backends/hpu_attn.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 1518e518e..1ad5e6e8e 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -10,7 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 import vllm_hpu_extension.ops as ops
-from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
+from vllm_hpu_extension.utils import (Matmul, ModuleFusedSDPA, Softmax,
+                                      VLLMKVCache)
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer,
@@ -137,9 +138,17 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
 
         self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
                                               '0').lower() in ['1', 'true']
+        self.fused_scaled_dot_product_attention = None
         if self.prefill_usefusedsdpa:
             assert alibi_slopes is None, \
                 'Prefill with FusedSDPA not supported with alibi slopes!'
+            try:
+                from habana_frameworks.torch.hpex.kernels import FusedSDPA
+                self.fused_scaled_dot_product_attention = ModuleFusedSDPA(
+                    FusedSDPA)
+            except ImportError:
+                logger().warning("Could not import HPU FusedSDPA kernel. "
+                                 "vLLM will use native implementation.")
 
         suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
         if head_size not in suppored_head_sizes:
@@ -227,6 +236,7 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
                 matmul_qk_op=self.matmul_qk,
                 softmax_op=self.softmax,
                 matmul_av_op=self.matmul_av,
+                fsdpa_op=self.fused_scaled_dot_product_attention,
             )
             output = out.reshape(batch_size, seq_len, hidden_size)
         else:
-- 
GitLab


From 3b2005e1db79efe2ea4587035eb2e7ced6e258cb Mon Sep 17 00:00:00 2001
From: Rahul Tuli <rahul@neuralmagic.com>
Date: Wed, 5 Feb 2025 15:30:43 -0600
Subject: [PATCH 53/65] Add: Support for Sparse24Bitmask Compressed Models

---
 .../SparseLlama3.1_2of4_fp8_compressed.yaml   |  11 +
 tests/quantization/test_compressed_tensors.py | 332 +++++++++++++++---
 .../compressed_tensors/compressed_tensors.py  |  34 +-
 .../schemes/compressed_tensors_24.py          | 238 ++++++++++---
 4 files changed, 503 insertions(+), 112 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml

diff --git a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
new file mode 100644
index 000000000..2928d75ce
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
+model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.6353
+  - name: "exact_match,flexible-extract"
+    value: 0.637
+limit: null
+num_fewshot: null 
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 7e2e6f6ed..0655f2b38 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -3,6 +3,7 @@
 
 Run `pytest tests/quantization/test_compressed_tensors.py`.
 """
+
 from typing import Optional
 
 import pytest
@@ -22,12 +23,30 @@ from vllm.platforms import current_platform
 
 @pytest.mark.parametrize(
     "model_args",
-    [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
-      QuantizationType.INT, 2560, True),
-     ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
-      QuantizationType.INT, 2560, True),
-     ("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor",
-      QuantizationType.INT, 2560, False)])
+    [
+        (
+            "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+            "tensor",
+            QuantizationType.INT,
+            2560,
+            True,
+        ),
+        (
+            "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+            "channel",
+            QuantizationType.INT,
+            2560,
+            True,
+        ),
+        (
+            "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+            "tensor",
+            QuantizationType.INT,
+            2560,
+            False,
+        ),
+    ],
+)
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
     model_path, strategy, quant_type, shape_0, is_symmetric = model_args
     with vllm_runner(model_path, enforce_eager=True) as llm:
@@ -85,21 +104,31 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
         assert output
 
 
-@pytest.mark.parametrize("model_path", [
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
-])
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    ],
+)
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
-                                          example_prompts, model_path,
-                                          max_tokens, num_logprobs):
+def test_compressed_tensors_w8a8_logprobs(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model_path,
+    max_tokens,
+    num_logprobs,
+):
     dtype = "bfloat16"
 
     # skip language translation prompt for the static per tensor asym model
-    if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym":  # noqa: E501
+    if (model_path ==
+            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
+        ):  # noqa: E501
         example_prompts = example_prompts[0:-1]
 
     with hf_runner(model_path, dtype=dtype) as hf_model:
@@ -125,13 +154,21 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
         assert output
 
 
-@pytest.mark.parametrize("model_args", [
-    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
-    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
-    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-     "channel"),
-])
+@pytest.mark.parametrize(
+    "model_args",
+    [
+        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
+        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
+        (
+            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+            "channel",
+        ),
+        (
+            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+            "channel",
+        ),
+    ],
+)
 def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
     model_path, strategy = model_args
     with vllm_runner(model_path, dtype=torch.float16) as llm:
@@ -156,9 +193,12 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
 
 @pytest.mark.parametrize(
     "wNa16_args",
-    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
-     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
-     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
+    [
+        ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
+        ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
+        ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4),
+    ],
+)
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor = wNa16_args
     with vllm_runner(model) as llm:
@@ -218,7 +258,8 @@ def test_compressed_tensors_fp8(vllm_runner):
                               CompressedTensorsLinearMethod)
             assert isinstance(
                 qkv_proj.scheme,
-                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))
+                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8),
+            )
 
             assert qkv_proj.input_scale.dtype is torch.float32
 
@@ -241,9 +282,14 @@ def test_compressed_tensors_kv_cache(vllm_runner):
         assert output
 
 
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
-def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+def _test_2of4_quant_models(qkv_proj,
+                            weight_strategy,
+                            input_strategy,
+                            format="dense"):
     assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
     assert isinstance(qkv_proj.scheme, CompressedTensors24)
 
@@ -252,22 +298,39 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
     assert qkv_proj.scheme.quantized
     assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
     sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
-    assert sparsity_map.get("Linear").format == "dense"
+    assert sparsity_map.get("Linear").format == format
     assert sparsity_map.get("Linear").sparsity_structure == "2:4"
 
 
-@pytest.mark.skipif(not current_platform.has_device_capability(90),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
-@pytest.mark.parametrize("args_2of4", [
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel",
-     "token"),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-     "channel", "tensor"),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor",
-     "tensor"),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-     "tensor", "token"),
-])
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+            "channel",
+            "tensor",
+        ),
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+            "tensor",
+            "tensor",
+        ),
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+            "tensor",
+            "token",
+        ),
+    ],
+)
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
     with vllm_runner(model) as llm:
@@ -286,16 +349,134 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
         assert output
 
 
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
-@pytest.mark.parametrize("args_2of4", [
-    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-     "channel", "token"),
-    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor",
-     "tensor"),
-    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-     "tensor", "token"),
-])
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+            "channel",
+            "tensor",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+            "tensor",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+            "tensor",
+            "tensor",
+        ),
+    ],
+)
+def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
+    model, weight_strategy, input_strategy = args_2of4
+    with vllm_runner(model) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
+            _test_2of4_quant_models(
+                qkv_proj,
+                weight_strategy,
+                input_strategy,
+                format="sparse-24-bitmask",
+            )
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
+
+
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="cutlass is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+            "channel",
+            "tensor",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+            "tensor",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+            "tensor",
+            "tensor",
+        ),
+    ],
+)
+def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
+    model, weight_strategy, input_strategy = args_2of4
+    with vllm_runner(model) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert qkv_proj.scheme.weights_dtype == torch.int8
+            _test_2of4_quant_models(
+                qkv_proj,
+                weight_strategy,
+                input_strategy,
+                format="sparse-24-bitmask",
+            )
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
+
+
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+            "tensor",
+            "tensor",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+            "tensor",
+            "token",
+        ),
+    ],
+)
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
     with vllm_runner(model) as llm:
@@ -317,10 +498,12 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
 @pytest.mark.skipif(
     not sparse_cutlass_supported(),
-    reason="2of4 Sparse is not yet supported on this GPU type.")
+    reason="2of4 Sparse is not yet supported on this GPU type.",
+)
 @pytest.mark.parametrize(
     "args_2of4",
-    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
+    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")],
+)
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
     with vllm_runner(model) as llm:
@@ -337,7 +520,9 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
             assert qkv_proj.scheme.input_quant is None
             assert not qkv_proj.scheme.quantized
             assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
+            sparsity_map = (
+                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
+            )  # noqa: E501
             assert sparsity_map.get("Linear").format == "dense"
             assert sparsity_map.get("Linear").sparsity_structure == "2:4"
 
@@ -346,3 +531,38 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         print(output)
         assert output
+
+
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Cutlass is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")])
+def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
+    model = args_2of4
+    with vllm_runner(model) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensors24)
+
+            assert qkv_proj.scheme.weight_quant is None
+            assert qkv_proj.scheme.input_quant is None
+            assert not qkv_proj.scheme.quantized
+            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
+            sparsity_map = (
+                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
+            )  # noqa: E501
+            assert sparsity_map.get("Linear").format == "sparse-24-bitmask"
+            assert sparsity_map.get("Linear").sparsity_structure == "2:4"
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 0e3258e4a..6ee3e9362 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -417,15 +417,22 @@ class CompressedTensorsConfig(QuantizationConfig):
                 return None
             # Have a valid sparsity scheme
             # Validate layer is supported by Cutlass 2:4 Kernel
-            scheme = CompressedTensors24(quantized=weight_quant is not None
-                                         or input_quant is not None,
-                                         weight_quant=weight_quant,
-                                         input_quant=input_quant)
+            model_compression_config = (None if sparsity_scheme is None
+                                        or sparsity_scheme.format == "dense"
+                                        else self.config)
+
+            scheme = CompressedTensors24(
+                quantized=weight_quant is not None or input_quant is not None,
+                weight_quant=weight_quant,
+                input_quant=input_quant,
+                model_compression_config=model_compression_config,
+            )
         elif weight_quant is None:
             logger.warning_once("Acceleration for non-quantized schemes is "
                                 "not supported by Compressed Tensors. "
                                 "Falling back to UnquantizedLinearMethod")
             return None
+
         else:
             # Find the quant_scheme
             scheme = self._get_scheme_from_parts(  # type: ignore
@@ -475,10 +482,21 @@ class CompressedTensorsConfig(QuantizationConfig):
         :return: True if the layer is supported by the Cutlass 2:4 Kernel
             False otherwise
         """
-        is_valid_sparsity = (sparsity_scheme is not None
-                             and sparsity_scheme.sparsity_structure
-                             == SparsityStructure.TWO_FOUR.value
-                             and sparsity_scheme.format == "dense")
+        if sparsity_scheme is None:
+            return False
+
+        is_valid_sparsity_structure: bool = (
+            sparsity_scheme.sparsity_structure ==
+            SparsityStructure.TWO_FOUR.value)
+
+        valid_compressors = {
+            CompressionFormat.dense.value,
+            CompressionFormat.sparse_24_bitmask.value
+        }
+
+        is_valid_sparsity = (is_valid_sparsity_structure
+                             and sparsity_scheme.format in valid_compressors)
+
         if not is_valid_sparsity:
             return False
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index 84f924b23..0fb8dfa96 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -1,13 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Callable, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
+from compressed_tensors import CompressionFormat, ModelCompressor
 from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy,
                                              QuantizationType)
+from compressed_tensors.utils import combine_shards
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@@ -22,26 +26,39 @@ __all__ = ["CompressedTensors24"]
 
 class CompressedTensors24(CompressedTensorsScheme):
 
-    def __init__(self,
-                 quantized: bool = False,
-                 weight_quant: Optional[QuantizationArgs] = None,
-                 input_quant: Optional[QuantizationArgs] = None):
-
+    def __init__(
+        self,
+        quantized: bool = False,
+        weight_quant: Optional[QuantizationArgs] = None,
+        input_quant: Optional[QuantizationArgs] = None,
+        model_compression_config: Optional[Dict[str, Any]] = None,
+    ):
         self.quantized = quantized
         self.weight_quant = weight_quant
         self.input_quant = input_quant
+        self.model_compressor = (
+            ModelCompressor.from_compression_config(model_compression_config)
+            if model_compression_config is not None else None)
+        self.do_sparse_decompress = (
+            self.model_compressor is not None
+            and self.model_compressor.sparsity_config.format
+            == CompressionFormat.sparse_24_bitmask.value)
 
     @classmethod
     def get_min_capability(cls) -> int:
         # Only cutlass 3.x kernels are implemented so far
         return 90
 
-    def create_weights(self, layer: torch.nn.Module, input_size: int,
-                       output_partition_sizes: List[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size: int,
+        output_partition_sizes: List[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         if not sparse_cutlass_supported():
             raise ValueError(
                 "Sparse CUTLASS not supported. vLLM must be built with "
@@ -49,16 +66,56 @@ class CompressedTensors24(CompressedTensorsScheme):
 
         self.output_dtype = params_dtype
         layer.logical_widths = output_partition_sizes
+        layer.input_size = input_size
+        layer.input_size_per_partition = input_size_per_partition
         self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype)
 
         # parameter to store uncompressed weight
-        weight = ModelWeightParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition,
-            dtype=self.weights_dtype),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=self.weights_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        if self.do_sparse_decompress:
+            assert all(partition_size % 8 == 0
+                       for partition_size in output_partition_sizes
+                       ), "All partitions must be divisible by 8 for "
+            "2:4 sparse compressed models"
+
+            shape = BasevLLMParameter(
+                data=torch.empty(2, 1, dtype=torch.int64),
+                weight_loader=weight_loader,
+            )
+            compressed_weight = ModelWeightParameter(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition // 2,
+                    dtype=self.weights_dtype,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+
+            bitmask = ModelWeightParameter(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition // 8,
+                    dtype=torch.uint8,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+
+            layer.register_parameter("shape", shape)
+            layer.register_parameter("compressed", compressed_weight)
+            layer.register_parameter("bitmask", bitmask)
 
         # Check if quantized, not just 2:4 Sparse
         if self.quantized:
@@ -68,14 +125,16 @@ class CompressedTensors24(CompressedTensorsScheme):
                     data=torch.empty((sum(output_partition_sizes), 1),
                                      dtype=torch.float32),
                     output_dim=0,
-                    weight_loader=weight_loader)
+                    weight_loader=weight_loader,
+                )
             else:
                 assert (self.weight_quant and self.weight_quant.strategy
                         == QuantizationStrategy.TENSOR.value)
                 weight_scale = PerTensorScaleParameter(
                     data=torch.empty(len(output_partition_sizes),
                                      dtype=torch.float32),
-                    weight_loader=weight_loader)
+                    weight_loader=weight_loader,
+                )
 
             layer.register_parameter("weight_scale", weight_scale)
 
@@ -84,9 +143,10 @@ class CompressedTensors24(CompressedTensorsScheme):
                 # register input quant scale
                 assert (self.input_quant.strategy ==
                         QuantizationStrategy.TENSOR.value)
-                input_scale = BasevLLMParameter(data=torch.empty(
-                    1, dtype=torch.float32),
-                                                weight_loader=weight_loader)
+                input_scale = BasevLLMParameter(
+                    data=torch.empty(1, dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
 
                 layer.register_parameter("input_scale", input_scale)
 
@@ -107,13 +167,25 @@ class CompressedTensors24(CompressedTensorsScheme):
         """
         Compress weights after loading. Store compressed weight and meta
             tensor
-        
+
         :post-condition: layer.w_compressed and layer.meta are
             set to the compressed weight and meta tensor in the
             format expected by the Cutlass kernels
         :param layer: The layer with the weights to be processed
-        
+
         """
+        if self.do_sparse_decompress:
+            layer.weight.data = self._decompress_bitmask_compressed_weight(
+                compressed=layer.compressed,
+                bitmask=layer.bitmask,
+                layer=layer,
+            )
+
+            # compressed and bitmask tensors
+            # are no longer needed after decompression
+            del layer.compressed
+            del layer.bitmask
+
         # torch.compile workaround
         if hasattr(layer, "input_scale"):
             layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
@@ -121,10 +193,13 @@ class CompressedTensors24(CompressedTensorsScheme):
 
         if self.weight_quant:
             if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value:
-                layer.weight_scale = torch.nn.Parameter(convert_to_channelwise(
-                    weight_scale=layer.weight_scale,
-                    logical_widths=layer.logical_widths),
-                                                        requires_grad=False)
+                layer.weight_scale = torch.nn.Parameter(
+                    convert_to_channelwise(
+                        weight_scale=layer.weight_scale,
+                        logical_widths=layer.logical_widths,
+                    ),
+                    requires_grad=False,
+                )
             else:
                 # torch.compile workaround
                 layer.weight_scale = torch.nn.Parameter(
@@ -134,20 +209,22 @@ class CompressedTensors24(CompressedTensorsScheme):
         layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False)
         layer.meta = torch.nn.Parameter(meta, requires_grad=False)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
-        Returns the output tensor for the layer with 2:4 
+        Returns the output tensor for the layer with 2:4
         sparse compressed weights, given the input tensor
         and bias
 
-        :param layer: The layer with 2:4 sparse compressed 
+        :param layer: The layer with 2:4 sparse compressed
             weights to be used for the computation
         :param x: The input tensor to the layer
         :param bias: The bias to be added to the output tensor
-        :return: The output tensor of the layer 
+        :return: The output tensor of the layer
         """
         if self.quantized:
             scale = None
@@ -171,13 +248,15 @@ class CompressedTensors24(CompressedTensorsScheme):
             input_scale = layer.input_scale
             q_input = x
 
-        out = ops.cutlass_scaled_sparse_mm(a=q_input,
-                                           bt_nzs=layer.weight,
-                                           bt_meta=layer.meta,
-                                           scale_a=input_scale,
-                                           scale_b=layer.weight_scale,
-                                           out_dtype=self.output_dtype,
-                                           bias=bias)
+        out = ops.cutlass_scaled_sparse_mm(
+            a=q_input,
+            bt_nzs=layer.weight,
+            bt_meta=layer.meta,
+            scale_a=input_scale,
+            scale_b=layer.weight_scale,
+            out_dtype=self.output_dtype,
+            bias=bias,
+        )
         assert out.is_contiguous()
         return out
 
@@ -203,8 +282,71 @@ class CompressedTensors24(CompressedTensorsScheme):
 
         raise ValueError("Quantization type not supported by Cutlass")
 
+    def _decompress_bitmask_compressed_weight(
+        self,
+        compressed: torch.Tensor,
+        bitmask: torch.Tensor,
+        layer: torch.nn.Module,
+    ) -> torch.Tensor:
+        """
+        Decompress a compressed 2:4 sparse weight tensor using the bitmask and
+        return the result.
+
+        This function also supports sharded decompression.
+
+        :param compressed: The 2:4 sparse weight tensor compressed using the
+            sparse-24-bitmask compressor. This is different from
+            `cutlass_sparse_compress` which uses a different scheme (2 bits for
+            every nonzero element that represent the coordinate within the block
+            of 4). The bitmask compression here uses a bitmask to indicate the
+            positions of non-zero elements.
+        :param bitmask: The 2:4 bitmask associated with the compressed weights,
+            representing the positions of non-zero elements in the compressed
+            tensor.
+        :param layer: The layer whose weights need to be processed after 
+            loading.
+        :return: The decompressed 2:4 sparse weight tensor.
+        """
 
-def check_24(tensor):
-    new_tensor = tensor.view(-1, 4)
-    zero_counts = (new_tensor == 0).sum(dim=1)
-    return (zero_counts >= 2).all().item()
+        sparsity_compressor = self.model_compressor.sparsity_compressor
+
+        def _process_split(
+            bitmask_compressed_weight: torch.Tensor,
+            shape,
+            bitmask: torch.Tensor,
+        ) -> torch.Tensor:
+            weight_data = dict(
+                compressed=bitmask_compressed_weight,
+                shape=shape,
+                bitmask=bitmask,
+            )
+            return sparsity_compressor.decompress_weight(weight_data)
+
+        split_weights: List[torch.Tensor] = []
+        split_bitmask: List[torch.Tensor] = []
+        split_shape: List[Tuple[int, int]] = []
+
+        if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)):
+            split_weights = torch.split(compressed, layer.logical_widths)
+            split_bitmask = torch.split(bitmask, layer.logical_widths)
+            split_shape = [(out, layer.input_size_per_partition)
+                           for out in layer.logical_widths]
+
+        if split_weights:
+            decompressed_shards = [
+                _process_split(compressed_weight, shape, bitmask)
+                for compressed_weight, shape, bitmask in zip(
+                    split_weights, split_shape, split_bitmask)
+            ]
+            decompressed = combine_shards(decompressed_shards)
+        else:
+            decompressed = sparsity_compressor.decompress_weight(
+                dict(
+                    compressed=compressed,
+                    shape=(
+                        layer.logical_widths[0],
+                        layer.input_size_per_partition,
+                    ),
+                    bitmask=bitmask,
+                ))
+        return decompressed
-- 
GitLab


From a4ce74c14a469b178c446a34ce48f158909a8e74 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 6 Feb 2025 05:30:46 +0800
Subject: [PATCH 54/65] [VLM] Use shared field to pass token ids to model

---
 vllm/model_executor/models/internvl.py |   6 +-
 vllm/multimodal/inputs.py              | 275 +++++++++++++++++++++----
 2 files changed, 235 insertions(+), 46 deletions(-)

diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 08fc659ab..380eb40d9 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -564,8 +564,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
         # Since there may be extra tokens in the feature placeholders,
         # we need to pass the image token ID to the model to select the
         # tokens to merge from the vision encoder outputs
-        processed_outputs["image_token_id"] = [image_token_id
-                                               ] * len(image_data)
+        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
 
         return processed_outputs
 
@@ -575,13 +574,14 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
+        num_images = len(image_num_patches)
 
         return dict(
             pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
                 "image", image_num_patches),
             image_num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
-            image_token_id=MultiModalFieldConfig.batched("image"),
+            image_token_id=MultiModalFieldConfig.shared("image", num_images),
         )
 
     def _get_prompt_replacements(
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 8e4af7f88..2f2535f36 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
+from functools import partial
 from itertools import accumulate
 from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar,
                     Union, cast, final)
@@ -164,51 +165,112 @@ A dictionary containing nested tensors which have been batched via
 
 @dataclass(frozen=True)
 class MultiModalFieldElem:
-    """Contains metadata and data of an item in :class:`MultiModalKwargs`."""
-    field: "BaseMultiModalField"
+    """
+    Represents a keyword argument corresponding to a multi-modal item
+    in :class:`MultiModalKwargs`.
+    """
+
+    modality: str
+    """
+    The modality of the corresponding multi-modal item.
+    Each multi-modal item can consist of multiple keyword arguments.
+    """
+
+    key: str
+    """
+    The key of this field in :class:`MultiModalKwargs`,
+    i.e. the name of the keyword argument to be passed to the model.
+    """
+
     data: NestedTensors
+    """
+    The tensor data of this field in :class:`MultiModalKwargs`,
+    i.e. the value of the keyword argument to be passed to the model.
+    """
+
+    field: "BaseMultiModalField"
+    """
+    Defines how to combine the tensor data of this field with others
+    in order to batch multi-modal items together for model inference.
+    """
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, self.__class__):
             return False
 
-        return (self.field == other.field
-                and nested_tensors_equal(self.data, other.data))
+        return ((self.modality, self.key) == (other.modality, other.key)
+                and nested_tensors_equal(self.data, other.data)
+                and type(self.field) == type(other.field))  # noqa: E721
 
 
 @dataclass(frozen=True)
 class BaseMultiModalField(ABC):
-    """Abstract base class for a field in :class:`MultiModalKwargs`."""
-    key: str
-    modality: str
+    """
+    Defines how to interpret tensor data belonging to a keyword argument in
+    :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa.
+    """
+
+    def _field_factory(self, *, modality: str, key: str):
+        f = partial(
+            MultiModalFieldElem,
+            modality=modality,
+            key=key,
+            field=self,
+        )
+
+        # Allow passing data as positional argument
+        def factory(data: NestedTensors) -> MultiModalFieldElem:
+            return f(data=data)
+
+        return factory
 
     @abstractmethod
-    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+    def build_elems(
+        self,
+        modality: str,
+        key: str,
+        data: NestedTensors,
+    ) -> Sequence[MultiModalFieldElem]:
+        """
+        Construct :class:`MultiModalFieldElem` instances to represent
+        the provided data.
+        
+        This is the inverse of :meth:`reduce_data`.
+        """
         raise NotImplementedError
 
-    def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem:
-        return MultiModalFieldElem(self, data)
+    @abstractmethod
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        raise NotImplementedError
 
-    def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem:
-        """Merge multiple instances of :class:`MultiModalFieldElem` together."""
-        fields = [item.field for item in batch]
-        if len(set(fields)) > 1:
-            raise ValueError(f"Cannot merge different {fields=}")
+    def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
+        """
+        Merge the data from multiple instances of :class:`MultiModalFieldElem`.
 
-        data = self._reduce_data([item.data for item in batch])
+        This is the inverse of :meth:`build_elems`.
+        """
+        field_types = [type(item.field) for item in elems]
+        if len(set(field_types)) > 1:
+            raise ValueError(f"Cannot merge different {field_types=}")
 
-        return self._build_elem(data)
+        return self._reduce_data([item.data for item in elems])
 
 
 @dataclass(frozen=True)
 class MultiModalBatchedField(BaseMultiModalField):
     """
-    A :class:`BaseMultiModalField` implementation where an element in the batch
-    is obtained by indexing into the first dimension of the underlying data.
+    See also:
+        :func:`MultiModalFieldConfig.batched`
     """
 
-    def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]:
-        return [self._build_elem(item) for item in batch]
+    def build_elems(
+        self,
+        modality: str,
+        key: str,
+        data: NestedTensors,
+    ) -> Sequence[MultiModalFieldElem]:
+        field_factory = self._field_factory(modality=modality, key=key)
+        return [field_factory(item) for item in data]
 
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
@@ -227,16 +289,20 @@ class MultiModalBatchedField(BaseMultiModalField):
 @dataclass(frozen=True)
 class MultiModalFlatField(BaseMultiModalField):
     """
-    A :class:`BaseMultiModalField` implementation where an element in the batch
-    is obtained by slicing along the first dimension of the underlying data.
+    See also:
+        :func:`MultiModalFieldConfig.flat`
+        :func:`MultiModalFieldConfig.flat_from_sizes`
     """
+    slices: Sequence[slice]
 
     def build_elems(
         self,
-        batch: NestedTensors,
-        slices: Sequence[slice],
-    ) -> list[MultiModalFieldElem]:
-        return [self._build_elem(batch[slice_]) for slice_ in slices]
+        modality: str,
+        key: str,
+        data: NestedTensors,
+    ) -> Sequence[MultiModalFieldElem]:
+        field_factory = self._field_factory(modality=modality, key=key)
+        return [field_factory(data[s]) for s in self.slices]
 
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
@@ -252,25 +318,121 @@ class MultiModalFlatField(BaseMultiModalField):
         return [e for elem in batch for e in elem]
 
 
+@dataclass(frozen=True)
+class MultiModalSharedField(BaseMultiModalField):
+    """
+    See also:
+        :func:`MultiModalFieldConfig.shared`
+    """
+    batch_size: int
+
+    def build_elems(
+        self,
+        modality: str,
+        key: str,
+        data: NestedTensors,
+    ) -> Sequence[MultiModalFieldElem]:
+        field_factory = self._field_factory(modality=modality, key=key)
+        return [field_factory(data)] * self.batch_size
+
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        return batch[0]
+
+
 class MultiModalFieldConfig:
 
     @staticmethod
     def batched(modality: str):
+        """
+        Defines a field where an element in the batch is obtained by
+        indexing into the first dimension of the underlying data.
+
+        Args:
+            modality: The modality of the multi-modal item that uses this
+                keyword argument.
+
+        Example:
+
+        .. code-block::
+
+            Input:
+                Data: [[AAAA]
+                       [BBBB]
+                       [CCCC]]
+
+            Output:
+                Element 1: [AAAA]
+                Element 2: [BBBB]
+                Element 3: [CCCC]
+        """
         return MultiModalFieldConfig(
-            field_cls=MultiModalBatchedField,
+            field=MultiModalBatchedField(),
             modality=modality,
         )
 
     @staticmethod
     def flat(modality: str, slices: Sequence[slice]):
+        """
+        Defines a field where an element in the batch is obtained by
+        slicing along the first dimension of the underlying data.
+
+        Args:
+            modality: The modality of the multi-modal item that uses this
+                keyword argument.
+            slices: For each multi-modal item, a slice that is used to extract
+                the data corresponding to it.
+
+        Example:
+
+        .. code-block::
+    
+            Given:
+                slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
+
+            Input:
+                Data: [AAABBBBCC]
+
+            Output:
+                Element 1: [AAA]
+                Element 2: [BBBB]
+                Element 3: [CC]
+        """
         return MultiModalFieldConfig(
-            field_cls=MultiModalFlatField,
+            field=MultiModalFlatField(slices=slices),
             modality=modality,
-            slices=slices,
         )
 
     @staticmethod
     def flat_from_sizes(modality: str, size_per_item: torch.Tensor):
+        """
+        Defines a field where an element in the batch is obtained by
+        slicing along the first dimension of the underlying data.
+
+        Args:
+            modality: The modality of the multi-modal item that uses this
+                keyword argument.
+            slices: For each multi-modal item, the size of the slice that
+                is used to extract the data corresponding to it.
+
+        Example:
+
+        .. code-block::
+    
+            Given:
+                size_per_item: [3, 4, 2]
+
+            Input:
+                Data: [AAABBBBCC]
+
+            Output:
+                Element 1: [AAA]
+                Element 2: [BBBB]
+                Element 3: [CC]
+    
+        See also:
+            :func:`MultiModalFieldConfig.flat`
+        """
+
         slice_idxs = [0, *accumulate(size_per_item)]
         slices = [
             slice(slice_idxs[i], slice_idxs[i + 1])
@@ -279,25 +441,52 @@ class MultiModalFieldConfig:
 
         return MultiModalFieldConfig.flat(modality, slices)
 
-    def __init__(
-        self,
-        field_cls: type[BaseMultiModalField],
-        modality: str,
-        **field_config: Any,
-    ) -> None:
+    @staticmethod
+    def shared(modality: str, batch_size: int):
+        """
+        Defines a field where an element in the batch is obtained by
+        taking the entirety of the underlying data.
+
+        This means that the data is the same for each element in the batch.
+
+        Args:
+            modality: The modality of the multi-modal item that uses this
+                keyword argument.
+            batch_size: The number of multi-modal items which share this data.
+
+        Example:
+
+        .. code-block::
+    
+            Given:
+                batch_size: 4
+
+            Input:
+                Data: [XYZ]
+
+            Output:
+                Element 1: [XYZ]
+                Element 2: [XYZ]
+                Element 3: [XYZ]
+                Element 4: [XYZ]
+        """
+        return MultiModalFieldConfig(
+            field=MultiModalSharedField(batch_size),
+            modality=modality,
+        )
+
+    def __init__(self, field: BaseMultiModalField, modality: str) -> None:
         super().__init__()
 
-        self.field_cls = field_cls
+        self.field = field
         self.modality = modality
-        self.field_config = field_config
 
     def build_elems(
         self,
         key: str,
         batch: NestedTensors,
     ) -> Sequence[MultiModalFieldElem]:
-        field = self.field_cls(key=key, modality=self.modality)
-        return field.build_elems(batch, **self.field_config)  # type: ignore
+        return self.field.build_elems(self.modality, key, batch)
 
 
 class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
@@ -308,11 +497,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
 
     @staticmethod
     def from_elems(elems: Sequence[MultiModalFieldElem]):
-        return MultiModalKwargsItem({elem.field.key: elem for elem in elems})
+        return MultiModalKwargsItem({elem.key: elem for elem in elems})
 
     @property
     def modality(self) -> str:
-        modalities = {elem.field.modality for elem in self.data.values()}
+        modalities = {elem.modality for elem in self.data.values()}
         assert len(modalities) == 1, f"Found different modalities={modalities}"
         return next(iter(modalities))
 
@@ -372,7 +561,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
                 elems_by_key[key].append(elem)
 
         data = {
-            key: elems[0].field.reduce(elems).data
+            key: elems[0].field.reduce_data(elems)
             for key, elems in elems_by_key.items() if len(elems) > 0
         }
 
-- 
GitLab


From 9a5b1554b4f049aad6398bb29d3064138ac9a039 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Wed, 5 Feb 2025 16:30:50 -0500
Subject: [PATCH 55/65] [Docs] Drop duplicate [source] links

---
 docs/source/conf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ea3b56e02..f4e8c8b94 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -37,7 +37,6 @@ author = 'the vLLM Team'
 # ones.
 extensions = [
     "sphinx.ext.napoleon",
-    "sphinx.ext.viewcode",
     "sphinx.ext.linkcode",
     "sphinx.ext.intersphinx",
     "sphinx_copybutton",
-- 
GitLab


From bf3b79efb82676219a3275764d8fcf4c70097ce5 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 5 Feb 2025 13:31:38 -0800
Subject: [PATCH 56/65] [VLM] Qwen2.5-VL

---
 docs/source/models/supported_models.md        |   11 +
 examples/offline_inference/vision_language.py |   31 +
 .../vision_language_multi_image.py            |   58 +
 .../vision_language/test_models.py            |   22 +
 .../multimodal/processing/test_common.py      |    1 +
 tests/models/registry.py                      |    2 +
 vllm/entrypoints/chat_utils.py                |    4 +-
 .../model_executor/layers/rotary_embedding.py |   58 +-
 vllm/model_executor/models/qwen2_5_vl.py      | 1133 +++++++++++++++++
 vllm/model_executor/models/qwen2_vl.py        |   16 +-
 vllm/model_executor/models/registry.py        |    1 +
 vllm/v1/worker/gpu_model_runner.py            |   12 +-
 vllm/worker/cpu_model_runner.py               |    9 +-
 vllm/worker/model_runner.py                   |    9 +-
 14 files changed, 1315 insertions(+), 52 deletions(-)
 create mode 100644 vllm/model_executor/models/qwen2_5_vl.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index d8e284292..3e8b2f896 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -846,6 +846,13 @@ See [this page](#generative-models) for more information on how to use generativ
   * ✅︎
   * ✅︎
   * ✅︎
+- * `Qwen2_5_VLForConditionalGeneration`
+  * Qwen2.5-VL
+  * T + I<sup>E+</sup> + V<sup>E+</sup>
+  * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.
+  *
+  * ✅︎
+  * ✅︎
 - * `UltravoxModel`
   * Ultravox
   * T + A<sup>E+</sup>
@@ -880,6 +887,10 @@ The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingf
 A corrected version is available at <gh-file:examples/template_pixtral_hf.jinja>.
 :::
 
+:::{note}
+To use Qwen2.5-VL series models, you have to install Huggingface `transformers` library from source via `pip install git+https://github.com/huggingface/transformers`.
+:::
+
 ### Pooling Models
 
 See [this page](pooling-models) for more information on how to use pooling models.
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 65940b6ad..436c36570 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -531,6 +531,36 @@ def run_qwen2_vl(question: str, modality: str):
     return llm, prompt, stop_token_ids
 
 
+# Qwen2.5-VL
+def run_qwen2_5_vl(question: str, modality: str):
+
+    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 model_example_map = {
     "aria": run_aria,
     "blip-2": run_blip2,
@@ -557,6 +587,7 @@ model_example_map = {
     "pixtral_hf": run_pixtral_hf,
     "qwen_vl": run_qwen_vl,
     "qwen2_vl": run_qwen2_vl,
+    "qwen2_5_vl": run_qwen2_5_vl,
 }
 
 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 601ac96e1..8d2172a60 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -392,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
     )
 
 
+def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
+    try:
+        from qwen_vl_utils import process_vision_info
+    except ModuleNotFoundError:
+        print('WARNING: `qwen-vl-utils` not installed, input images will not '
+              'be automatically resized. You can enable this functionality by '
+              '`pip install qwen-vl-utils`.')
+        process_vision_info = None
+
+    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_model_len=32768 if process_vision_info is None else 4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    stop_token_ids = None
+
+    if process_vision_info is None:
+        image_data = [fetch_image(url) for url in image_urls]
+    else:
+        image_data, _ = process_vision_info(messages,
+                                            return_video_sample_fps=False)
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=image_data,
+        chat_template=None,
+    )
+
+
 model_example_map = {
     "aria": load_aria,
     "deepseek_vl_v2": load_deepseek_vl2,
@@ -404,6 +461,7 @@ model_example_map = {
     "pixtral_hf": load_pixtral_hf,
     "qwen_vl_chat": load_qwen_vl_chat,
     "qwen2_vl": load_qwen2_vl,
+    "qwen2_5_vl": load_qwen2_5_vl,
 }
 
 
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 85bc4ac13..95505dcf5 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -121,6 +121,8 @@ VLM_TEST_SETTINGS = {
                else ("half", "float")),
         marks=[pytest.mark.core_model],
     ),
+    # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
+    # once we upgraded to transformers>=4.49.0.
     "qwen2_vl": VLMTestInfo(
         models=["Qwen/Qwen2-VL-2B-Instruct"],
         test_type=(
@@ -138,6 +140,26 @@ VLM_TEST_SETTINGS = {
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
+    "qwen2_5_vl": VLMTestInfo(
+        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.skipif(
+                TRANSFORMERS_VERSION < "4.49.0",
+                reason="HF model requires transformers>=4.49.0",
+            ), pytest.mark.core_model, pytest.mark.cpu_model],
+    ),
     #### Extended model tests
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 5cd749cbd..77cf3442d 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -161,6 +161,7 @@ def _test_processing_correctness(
     "nvidia/NVLM-D-72B",
     "Qwen/Qwen-VL-Chat",
     "Qwen/Qwen2-VL-2B-Instruct",
+    "Qwen/Qwen2.5-VL-3B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
     "fixie-ai/ultravox-v0_3",
 ])
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 285fbe484..20787fe00 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -264,6 +264,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                        trust_remote_code=True),
     "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
+                                                          min_transformers_version="4.49"),  # noqa: E501
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3",
                                      trust_remote_code=True),
     # [Encoder-decoder]
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 3a6e75b1d..f04902ae1 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -410,7 +410,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                 return "<image>"
             if model_type == "mllama":
                 return "<|image|>"
-            if model_type == "qwen2_vl":
+            if model_type in ("qwen2_vl", "qwen2_5_vl"):
                 return "<|vision_start|><|image_pad|><|vision_end|>"
             if model_type == "molmo":
                 return ""
@@ -430,7 +430,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                 return "(<audio>./</audio>)"
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "video":
-            if model_type == "qwen2_vl":
+            if model_type in ("qwen2_vl", "qwen2_5_vl"):
                 return "<|vision_start|><|video_pad|><|vision_end|>"
             if model_type in ("minicpmo", "minicpmv"):
                 return "(<video>./</video>)"
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 814c3b7d9..b3b9b0e87 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -27,6 +27,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 
 from vllm.model_executor.custom_op import CustomOp
 
@@ -772,8 +773,12 @@ class MRotaryEmbedding(RotaryEmbedding):
         dtype: torch.dtype,
         mrope_section: Optional[List[int]] = None,
     ) -> None:
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        # In Qwen2.5-VL, the maximum index value is related to the duration of
+        # the input video. We enlarge max_position_embeddings to 4 times to get
+        # a larger the cos and sin cache.
+        self.cache_max_position_num = max_position_embeddings * 4
+        super().__init__(head_size, rotary_dim, self.cache_max_position_num,
+                         base, is_neox_style, dtype)
 
         self.mrope_section = mrope_section
         if self.mrope_section:
@@ -831,13 +836,10 @@ class MRotaryEmbedding(RotaryEmbedding):
     @staticmethod
     def get_input_positions(
         input_tokens: List[int],
+        hf_config: PretrainedConfig,
         image_grid_thw: Union[List[List[int]], torch.Tensor],
         video_grid_thw: Union[List[List[int]], torch.Tensor],
-        image_token_id: int,
-        video_token_id: int,
-        vision_start_token_id: int,
-        vision_end_token_id: int,
-        spatial_merge_size: int,
+        second_per_grid_ts: Optional[List[float]] = None,
         context_len: int = 0,
         seq_len: Optional[int] = None,
     ) -> Tuple[List[List[int]], int]:
@@ -845,16 +847,13 @@ class MRotaryEmbedding(RotaryEmbedding):
 
         llm_positions, mrope_position_delta = \
             MRotaryEmbedding.get_input_positions_tensor(
-                input_tokens,
-                image_grid_thw,
-                video_grid_thw,
-                image_token_id,
-                video_token_id,
-                vision_start_token_id,
-                vision_end_token_id,
-                spatial_merge_size,
-                context_len,
-                seq_len,
+                input_tokens=input_tokens,
+                hf_config=hf_config,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+                context_len=context_len,
+                seq_len=seq_len,
             )
 
         return llm_positions.tolist(), mrope_position_delta
@@ -862,18 +861,22 @@ class MRotaryEmbedding(RotaryEmbedding):
     @staticmethod
     def get_input_positions_tensor(
         input_tokens: List[int],
+        hf_config: PretrainedConfig,
         image_grid_thw: Union[List[List[int]], torch.Tensor],
         video_grid_thw: Union[List[List[int]], torch.Tensor],
-        image_token_id: int,
-        video_token_id: int,
-        vision_start_token_id: int,
-        vision_end_token_id: int,
-        spatial_merge_size: int,
+        second_per_grid_ts: Optional[List[float]] = None,
         context_len: int = 0,
         seq_len: Optional[int] = None,
     ) -> Tuple[torch.Tensor, int]:
         """Get mrope input positions and delta value."""
 
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(hf_config.vision_config,
+                                    "tokens_per_second", 1.0)
+
         if isinstance(image_grid_thw, torch.Tensor):
             image_grid_thw = image_grid_thw.tolist()
         if isinstance(video_grid_thw, torch.Tensor):
@@ -892,6 +895,7 @@ class MRotaryEmbedding(RotaryEmbedding):
 
         image_index, video_index = 0, 0
         for _ in range(image_nums + video_nums):
+            video_second_per_grid_t = 0.0
             if image_token_id in input_tokens and remain_images > 0:
                 ed_image = input_tokens.index(image_token_id, st)
             else:
@@ -915,9 +919,13 @@ class MRotaryEmbedding(RotaryEmbedding):
                     video_grid_thw[video_index][1],
                     video_grid_thw[video_index][2],
                 )
+                video_second_per_grid_t = 1.0
+                if second_per_grid_ts is not None:
+                    video_second_per_grid_t = second_per_grid_ts[video_index]
                 video_index += 1
                 remain_videos -= 1
                 ed = ed_video
+
             llm_grid_t, llm_grid_h, llm_grid_w = \
                 t, h // spatial_merge_size, w // spatial_merge_size
             text_len = ed - st
@@ -927,8 +935,10 @@ class MRotaryEmbedding(RotaryEmbedding):
             llm_pos_ids_list.append(
                 torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
 
-            t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
-                -1, llm_grid_h * llm_grid_w).flatten()
+            t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
+                -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
+                       tokens_per_second).long().flatten()
+
             h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
                 llm_grid_t, -1, llm_grid_w).flatten()
             w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
new file mode 100644
index 000000000..e93cf46b9
--- /dev/null
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -0,0 +1,1133 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+# Copyright 2025 The vLLM team.
+# Copyright 2025 The Qwen Team.
+# Copyright 2025 The HuggingFace Inc. team.
+# All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
+from functools import cached_property, partial
+from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set,
+                    Tuple, TypedDict, Union)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers import BatchFeature
+from transformers.models.qwen2_5_vl import (Qwen2_5_VLImageProcessor,
+                                            Qwen2_5_VLProcessor)
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
+    Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.distributed import parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig)
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalFieldConfig
+from vllm.platforms import _Backend
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import uses_mrope
+
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
+from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder
+from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo,
+                       apply_rotary_pos_emb_vision)
+from .utils import (AutoWeightsLoader, WeightsMapper,
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
+from .vision import get_vit_attn_backend
+
+logger = init_logger(__name__)
+
+# === Vision Inputs === #
+
+
+class Qwen2_5_VLImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    pixel_values: torch.Tensor
+    """Shape:
+    `(num_patches, num_channels * patch_size * patch_size)`
+    """
+
+    image_grid_thw: torch.Tensor
+    """Shape: `(num_images, 3)`
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+class Qwen2_5_VLImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    image_embeds: torch.Tensor
+    """Supported types:
+    - List[`torch.Tensor`]: A list of tensors holding all images' features.
+        Each tensor holds an image's features.
+    - `torch.Tensor`: A tensor holding all images' features
+        (concatenation of all images' feature tensors).
+
+    Tensor shape: `(num_image_features, hidden_size)`
+    - `num_image_features` varies based on
+        the number and resolution of the images.
+    - `hidden_size` must match the hidden size of language model backbone.
+    """
+
+    image_grid_thw: torch.Tensor
+    """Shape: `(num_images, 3)`
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs,
+                              Qwen2_5_VLImageEmbeddingInputs]
+
+
+class Qwen2_5_VLVideoPixelInputs(TypedDict):
+    type: Literal["pixel_values_videos"]
+    pixel_values_videos: torch.Tensor
+    """Shape:
+    `(num_patches,
+      num_channels * temporal_patch_size * patch_size * patch_size)`
+    """
+
+    video_grid_thw: torch.Tensor
+    """Shape: `(num_videos, 3)`
+
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+    second_per_grid_ts: torch.Tensor
+    """
+    The video time interval (in seconds) for each grid along the temporal 
+    dimension in the 3D position IDs. Returned when `videos` is not `None`.
+    """
+
+
+class Qwen2_5_VLVideoEmbeddingInputs(TypedDict):
+    type: Literal["video_embeds"]
+    video_embeds: torch.Tensor
+    """Supported types:
+    - List[`torch.Tensor`]: A list of tensors holding all videos' features.
+        Each tensor holds an video's features.
+    - `torch.Tensor`: A tensor holding all videos' features
+      (concatenation of all videos' feature tensors).
+
+    Tensor shape: `(num_image_features, hidden_size)`
+    - `num_image_features` varies based on
+        the number and resolution of the videos.
+    - `hidden_size` must match the hidden size of language model backbone.
+    """
+
+    video_grid_thw: torch.Tensor
+    """Shape: `(num_videos, 3)`
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs,
+                              Qwen2_5_VLVideoEmbeddingInputs]
+
+# === Vision Encoder === #
+
+
+class Qwen2_5_VisionMLP(nn.Module):
+
+    def __init__(self,
+                 in_features: int,
+                 hidden_features: int,
+                 bias: bool = False,
+                 act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__()
+        self.gate_proj = ColumnParallelLinear(in_features,
+                                              hidden_features,
+                                              bias=bias,
+                                              quant_config=quant_config,
+                                              prefix=f"{prefix}.gate_proj")
+        self.up_proj = ColumnParallelLinear(in_features,
+                                            hidden_features,
+                                            bias=bias,
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.up_proj")
+        self.down_proj = RowParallelLinear(hidden_features,
+                                           in_features,
+                                           bias=bias,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj")
+        self.act_fn = act_fn
+
+    def forward(self, x: torch.Tensor):
+        x_gate, _ = self.gate_proj(x)
+        x_gate = self.act_fn(x_gate)
+        x_up, _ = self.up_proj(x)
+        x_down, _ = self.down_proj(x_gate * x_up)
+        return x_down
+
+
+class Qwen2_5_VisionAttention(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # Per attention head and per partition values.
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads)
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, world_size)
+
+        self.qkv = ColumnParallelLinear(input_size=embed_dim,
+                                        output_size=3 * projection_size,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.qkv")
+        self.proj = RowParallelLinear(input_size=projection_size,
+                                      output_size=embed_dim,
+                                      quant_config=quant_config,
+                                      prefix=f"{prefix}.proj")
+
+        # Detect attention implementation.
+        self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True)
+        if self.attn_backend not in {
+                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS
+        }:
+            raise RuntimeError(
+                f"Qwen2.5-VL does not support {self.attn_backend} backend now."
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+    ) -> torch.Tensor:
+        # [s, b, c] --> [s, b, head * 3 * head_dim]
+        x, _ = self.qkv(x)
+
+        # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim]
+        new_x_shape = x.size()[:-1] + (
+            self.num_attention_heads_per_partition,
+            3 * self.hidden_size_per_attention_head,
+        )
+        x = x.view(*new_x_shape)
+
+        # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim]
+        q, k, v = dist_utils.split_tensor_along_last_dim(x, 3)
+        batch_size = q.shape[1]
+
+        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
+                   for x in (q, k, v))
+        if rotary_pos_emb is not None:
+            q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
+            k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+
+        if self.attn_backend == _Backend.FLASH_ATTN:
+            # from vllm_flash_attn.flash_attn_interface import (
+            #   flash_attn_varlen_func)
+            from flash_attn import flash_attn_varlen_func
+
+            q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
+
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+            output = flash_attn_varlen_func(q,
+                                            k,
+                                            v,
+                                            cu_seqlens_q=cu_seqlens,
+                                            cu_seqlens_k=cu_seqlens,
+                                            max_seqlen_q=max_seqlen,
+                                            max_seqlen_k=max_seqlen,
+                                            dropout_p=0,
+                                            causal=False)
+
+            context_layer = rearrange(output,
+                                      "(b s) ... -> b s ...",
+                                      b=batch_size)
+        elif self.attn_backend == _Backend.TORCH_SDPA:
+            seq_length = q.size(1)
+            q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v])
+            attention_mask = torch.zeros([1, seq_length, seq_length],
+                                         device=q.device,
+                                         dtype=torch.bool)
+            for i in range(1, len(cu_seqlens)):
+                attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i],
+                               cu_seqlens[i - 1]:cu_seqlens[i]] = True
+            output = F.scaled_dot_product_attention(q,
+                                                    k,
+                                                    v,
+                                                    attention_mask,
+                                                    dropout_p=0.0)
+            context_layer = rearrange(output, "b h s d -> b s h d ")
+        elif self.attn_backend == _Backend.XFORMERS:
+            from xformers import ops as xops
+            from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+
+            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
+                                                       kv_seqlen=None)
+
+            context_layer = xops.memory_efficient_attention_forward(
+                q, k, v, attn_bias=attn_bias, p=0, scale=None)
+        context_layer = rearrange(context_layer,
+                                  "b s h d -> s b (h d)").contiguous()
+
+        output, _ = self.proj(context_layer)
+        return output
+
+
+class Qwen2RMSNorm(nn.Module):
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance +
+                                                    self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Qwen2_5_VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_hidden_dim: int,
+        act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        self.attn = Qwen2_5_VisionAttention(embed_dim=dim,
+                                            num_heads=num_heads,
+                                            projection_size=dim,
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.attn")
+        self.mlp = Qwen2_5_VisionMLP(dim,
+                                     mlp_hidden_dim,
+                                     act_fn=act_fn,
+                                     bias=True,
+                                     quant_config=quant_config,
+                                     prefix=f"{prefix}.mlp")
+
+    def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor,
+                rotary_pos_emb: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.norm1(x),
+                          cu_seqlens=cu_seqlens,
+                          rotary_pos_emb=rotary_pos_emb)
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class Qwen2_5_VisionPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_channels: int = 3,
+        hidden_size: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.hidden_size = hidden_size
+
+        kernel_size = (temporal_patch_size, patch_size, patch_size)
+        self.proj = nn.Conv3d(in_channels,
+                              hidden_size,
+                              kernel_size=kernel_size,
+                              stride=kernel_size,
+                              bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        L, C = x.shape
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size,
+                   self.patch_size)
+        x = self.proj(x).view(L, self.hidden_size)
+        return x
+
+
+class Qwen2_5_VisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.ln_q = norm_layer(context_dim)
+        self.mlp = nn.ModuleList([
+            ColumnParallelLinear(self.hidden_size,
+                                 self.hidden_size,
+                                 bias=True,
+                                 quant_config=quant_config,
+                                 prefix=f"{prefix}.mlp.0"),
+            nn.GELU(),
+            RowParallelLinear(self.hidden_size,
+                              d_model,
+                              bias=True,
+                              quant_config=quant_config,
+                              prefix=f"{prefix}.mlp.2"),
+        ])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ln_q(x)
+        x = x.view(-1, self.hidden_size)
+
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta
+                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            seqlen *= 2
+            self._seq_len_cached = seqlen
+            self.inv_freq = 1.0 / (self.theta**(torch.arange(
+                0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device)
+                                                / self.dim))
+            seq = torch.arange(seqlen,
+                               device=self.inv_freq.device,
+                               dtype=self.inv_freq.dtype)
+            freqs = torch.outer(seq, self.inv_freq)
+            self._freqs_cached = freqs
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
+
+
+class Qwen2_5_VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        vision_config: Qwen2_5_VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        patch_size = vision_config.patch_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        in_channels = vision_config.in_channels
+        depth = vision_config.depth
+        self.hidden_size = vision_config.hidden_size
+        self.num_heads = vision_config.num_heads
+
+        # args for get_window_index
+        self.window_size = vision_config.window_size
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.fullatt_block_indexes = vision_config.fullatt_block_indexes
+        self.spatial_merge_unit = self.spatial_merge_size**2
+
+        self.patch_embed = Qwen2_5_VisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_channels=in_channels,
+            hidden_size=self.hidden_size,
+        )
+
+        # NOTE: We use torch native RMSNorm here for precision purposes.
+        norm_layer = partial(Qwen2RMSNorm, eps=norm_eps)
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList([
+            Qwen2_5_VisionBlock(
+                dim=self.hidden_size,
+                num_heads=self.num_heads,
+                mlp_hidden_dim=vision_config.intermediate_size,
+                act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
+                norm_layer=norm_layer,
+                quant_config=quant_config,
+                prefix=f"{prefix}.blocks.{layer_idx}")
+            for layer_idx in range(depth)
+        ])
+        self.merger = Qwen2_5_VisionPatchMerger(
+            d_model=vision_config.out_hidden_size,
+            context_dim=self.hidden_size,
+            norm_layer=norm_layer,
+            spatial_merge_size=self.spatial_merge_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.merger",
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            pos_ids.append(
+                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def get_window_index(self, grid_thw):
+        window_index: list = []
+        cu_window_seqlens: list = [0]
+        window_index_id = 0
+        vit_merger_window_size = (self.window_size //
+                                  self.spatial_merge_size // self.patch_size)
+
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h = grid_h // self.spatial_merge_size
+            llm_grid_w = grid_w // self.spatial_merge_size
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
+                grid_t, llm_grid_h, llm_grid_w)
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100)
+            index_padded = index_padded.reshape(grid_t, num_windows_h,
+                                                vit_merger_window_size,
+                                                num_windows_w,
+                                                vit_merger_window_size)
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t, num_windows_h * num_windows_w, vit_merger_window_size,
+                vit_merger_window_size)
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]
+            window_index.append(index_new + window_index_id)
+            cu_seqlens_tmp = seqlens.cumsum(
+                0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+        window_index = torch.cat(window_index, dim=0)
+        return window_index, cu_window_seqlens
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        hidden_states = x.to(device=self.device, dtype=self.dtype)
+        hidden_states = self.patch_embed(hidden_states)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        # windows attention
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=hidden_states.device,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32)
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
+        seq_len, _ = hidden_states.size()
+        hidden_states = hidden_states.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        hidden_states = hidden_states[window_index, :, :]
+        hidden_states = hidden_states.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
+                                             grid_thw[:, 0]).cumsum(
+                                                 dim=0, dtype=torch.int32)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+
+        # transformers
+        hidden_states = hidden_states.unsqueeze(1)
+        for layer_num, blk in enumerate(self.blocks):
+            if layer_num in self.fullatt_block_indexes:
+                cu_seqlens_now = cu_seqlens
+            else:
+                cu_seqlens_now = cu_window_seqlens
+            hidden_states = blk(hidden_states,
+                                cu_seqlens=cu_seqlens_now,
+                                rotary_pos_emb=rotary_pos_emb)
+
+        # adapter
+        hidden_states = self.merger(hidden_states)
+        reverse_indices = torch.argsort(window_index)
+        hidden_states = hidden_states[reverse_indices, :]
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name.endswith("qkv.weight"):
+                    visual_num_heads = self.num_heads
+                    visual_embed_dim = self.hidden_size
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(3, visual_num_heads,
+                                                       head_size,
+                                                       visual_embed_dim)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
+                elif name.endswith("qkv.bias"):
+                    visual_num_heads = self.num_heads
+                    visual_embed_dim = self.hidden_size
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(3, visual_num_heads,
+                                                       head_size)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1)
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(Qwen2_5_VLConfig)
+
+    def get_hf_processor(
+        self,
+        *,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        fps: Optional[float] = 2.0,
+    ) -> Qwen2_5_VLProcessor:
+        hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor)
+        image_processor = hf_processor.image_processor  # type: ignore
+        assert isinstance(image_processor, Qwen2_5_VLImageProcessor)
+
+        if min_pixels:
+            image_processor.min_pixels = min_pixels
+        if max_pixels:
+            image_processor.max_pixels = max_pixels
+        if max_pixels or min_pixels:
+            image_processor.size = {
+                "min_pixels": image_processor.min_pixels,
+                "max_pixels": image_processor.max_pixels,
+            }
+
+        return hf_processor
+
+    def get_image_processor(
+        self,
+        *,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        fps: Optional[float] = 2.0,
+    ) -> Qwen2_5_VLImageProcessor:
+        hf_processor = self.get_hf_processor(
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            fps=fps,
+        )
+        image_processor = hf_processor.image_processor  # type: ignore
+        assert isinstance(image_processor, Qwen2_5_VLImageProcessor)
+        return image_processor
+
+
+class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs),
+            second_per_grid_ts=MultiModalFieldConfig.batched("video"),
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen2_5_VLMultiModalProcessor,
+    info=Qwen2_5_VLProcessingInfo,
+    dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
+class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                         SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ]
+    }
+
+    # LoRA specific attributes, TODO: double check
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "gate_proj"
+        "up_proj",
+        # vision tower
+        "qkv",
+        "attn.proj",  # Distinguish patch_embed.proj
+        "fc1",
+        "fc2",
+        # projector
+        "mlp.0",
+        "mlp.2"
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
+        "lm_head.": "language_model.lm_head.",
+        "model.": "language_model.model.",
+    })
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.visual = Qwen2_5_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=self._maybe_ignore_quant_config(quant_config),
+            prefix=maybe_prefix(prefix, "visual"),
+        )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+            architectures=["Qwen2ForCausalLM"],
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
+
+        return get_sampler()
+
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
+        # seems to avoid vision encoder sections for some models.
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
+
+    def _validate_and_reshape_mm_tensor(self, mm_input: object,
+                                        name: str) -> torch.Tensor:
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(f"Incorrect type of {name}. "
+                             f"Got type: {type(mm_input)}")
+        if isinstance(mm_input, torch.Tensor):
+            if mm_input.ndim == 2:
+                return mm_input
+            if mm_input.ndim != 3:
+                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
+                                 f"Got ndim: {mm_input.ndim} "
+                                 f"(shape={mm_input.shape})")
+            return torch.concat(list(mm_input))
+        else:
+            return torch.concat(mm_input)
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            pixel_values = self._validate_and_reshape_mm_tensor(
+                pixel_values, "image pixel values")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, "image grid_thw")
+
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            return Qwen2_5_VLImagePixelInputs(type="pixel_values",
+                                              pixel_values=pixel_values,
+                                              image_grid_thw=image_grid_thw)
+
+        if image_embeds is not None:
+            image_embeds = self._validate_and_reshape_mm_tensor(
+                image_embeds, "image embeds")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, "image grid_thw")
+
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+            return Qwen2_5_VLImageEmbeddingInputs(
+                type="image_embeds",
+                image_embeds=image_embeds,
+                image_grid_thw=image_grid_thw)
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+        second_per_grid_ts = kwargs.pop("second_per_grid_ts", None)
+
+        if pixel_values_videos is None and video_embeds is None:
+            return None
+
+        if pixel_values_videos is not None:
+            pixel_values_videos = self._validate_and_reshape_mm_tensor(
+                pixel_values_videos, "video pixel values")
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, "video grid_thw")
+
+            return Qwen2_5_VLVideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values_videos=pixel_values_videos,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+            )
+
+        if video_embeds is not None:
+            video_embeds = self._validate_and_reshape_mm_tensor(
+                video_embeds, "video embeds")
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, "video grid_thw")
+
+            if not isinstance(video_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of video embeddings. "
+                                 f"Got type: {type(video_embeds)}")
+            return Qwen2_5_VLVideoEmbeddingInputs(
+                type="video_embeds",
+                video_embeds=video_embeds,
+                video_grid_thw=video_grid_thw)
+
+    def _process_image_input(
+            self,
+            image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = image_input["image_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if image_input["type"] == "image_embeds":
+            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each image item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+
+        return image_embeds.split(sizes.tolist())
+
+    def _process_video_input(
+            self,
+            video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if video_input["type"] == "video_embeds":
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"].type(
+                self.visual.dtype)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each video item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+
+        return video_embeds.split(sizes.tolist())
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_videos",
+                             "video_embeds") and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+        return modalities
+
+    def get_multimodal_embeddings(
+            self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]:
+
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
+            return None
+
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += vision_embeddings
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_video_input(video_input)
+                multimodal_embeddings += video_embeddings
+        return multimodal_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                [self.config.image_token_id, self.config.video_token_id])
+        return inputs_embeds
+
+    def get_input_embeddings_v0(
+        self,
+        input_ids: torch.Tensor,
+        image_input: Optional[tuple[torch.Tensor, ...]] = None,
+        video_input: Optional[tuple[torch.Tensor, ...]] = None,
+    ) -> torch.Tensor:
+
+        inputs_embeds = self.get_input_embeddings(input_ids)
+        if image_input is not None:
+            image_embeds = self._process_image_input(image_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                image_embeds,
+                placeholder_token_id=self.config.image_token_id,
+            )
+
+        if video_input is not None:
+            video_embeds = self._process_video_input(video_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                video_embeds,
+                placeholder_token_id=self.config.video_token_id,
+            )
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        """Run forward pass for Qwen2.5-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+            pixel_values: Pixel values to be fed to a model.
+                `None` if no images are passed.
+            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
+                `None` if no images are passed.
+            pixel_values_videos: Pixel values of videos to be fed to a model.
+                `None` if no videos are passed.
+            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
+                `None` if no videos are passed.
+            second_per_grid_ts: Tensor `(num_videos)` of video time interval (
+                in seconds) for each grid along the temporal dimension in the
+                3D position IDs. `None` if no videos are passed.
+        """
+
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner from
+        # `get_multimodal_embeddings` and `get_input_embeddings`, this
+        # condition is only for v0 compatibility.
+        elif inputs_embeds is None:
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            video_input = self._parse_and_validate_video_input(**kwargs)
+
+            if image_input is None and video_input is None:
+                inputs_embeds = None
+            else:
+                if uses_mrope(self.config):
+                    assert positions.ndim == 2 and positions.size(0) == 3, (
+                        "multimodal section rotary embedding requires "
+                        f"(3, seq_len) positions, but got {positions.size()}")
+                inputs_embeds = self.get_input_embeddings_v0(
+                    input_ids,
+                    image_input=image_input,
+                    video_input=video_input)
+                input_ids = None
+
+        hidden_states = self.language_model.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        return self.language_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="visual.",
+            tower_model="visual.merger.")
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 2b2638cf6..34ae7b8c9 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -650,8 +650,8 @@ class Qwen2VisionTransformer(nn.Module):
         return loaded_params
 
 
-class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
-                                            dict[str, torch.Tensor]]):
+class Qwen2VLEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
+                                              dict[str, torch.Tensor]]):
 
     def __init__(self, data: dict, modality: str) -> None:
         super().__init__(data, modality)
@@ -683,26 +683,26 @@ class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
         return self.data
 
 
-class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems):
+class Qwen2VLImageEmbeddingItems(Qwen2VLEmbeddingItems):
 
     def __init__(self, data: dict) -> None:
         super().__init__(data, "image")
 
 
-class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems):
+class Qwen2VLVideoEmbeddingItems(Qwen2VLEmbeddingItems):
 
     def __init__(self, data: dict) -> None:
         super().__init__(data, "video")
 
 
-class Qwen2MultiModalDataParser(MultiModalDataParser):
+class Qwen2VLMultiModalDataParser(MultiModalDataParser):
 
     def _parse_image_data(
         self,
         data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
-            return Qwen2EmbeddingItems(data, modality="image")
+            return Qwen2VLEmbeddingItems(data, modality="image")
 
         return super()._parse_image_data(data)
 
@@ -711,7 +711,7 @@ class Qwen2MultiModalDataParser(MultiModalDataParser):
         data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
-            return Qwen2EmbeddingItems(data, modality="video")
+            return Qwen2VLEmbeddingItems(data, modality="video")
 
         return super()._parse_video_data(data)
 
@@ -948,7 +948,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
                                  ):
 
     def _get_data_parser(self) -> MultiModalDataParser:
-        return Qwen2MultiModalDataParser()
+        return Qwen2VLMultiModalDataParser()
 
     def _get_prompt_replacements(
         self,
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 962f95f10..b6708f77d 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -172,6 +172,7 @@ _MULTIMODAL_MODELS = {
     "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
     "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
     "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     # [Encoder-decoder]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7841fac1d..ec6d04cd4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -285,6 +285,7 @@ class GPUModelRunner:
             if self.model_config.uses_mrope:
                 image_grid_thw = []
                 video_grid_thw = []
+                second_per_grid_ts = []
                 for mm_input in self.requests[req_id].mm_inputs:
                     if mm_input.get("image_grid_thw") is not None:
                         image_grid_thw.extend(
@@ -292,6 +293,9 @@ class GPUModelRunner:
                     if mm_input.get("video_grid_thw") is not None:
                         video_grid_thw.extend(
                             mm_input["video_grid_thw"].tolist())
+                    if mm_input.get("second_per_grid_ts") is not None:
+                        second_per_grid_ts.extend(
+                            mm_input["second_per_grid_ts"])
 
                 hf_config = self.model_config.hf_config
 
@@ -299,14 +303,10 @@ class GPUModelRunner:
                     self.requests[req_id].mrope_position_delta = \
                     MRotaryEmbedding.get_input_positions_tensor(
                         self.requests[req_id].prompt_token_ids,
+                        hf_config=hf_config,
                         image_grid_thw=image_grid_thw,
                         video_grid_thw=video_grid_thw,
-                        image_token_id=hf_config.image_token_id,
-                        video_token_id=hf_config.video_token_id,
-                        vision_start_token_id=hf_config.vision_start_token_id,
-                        vision_end_token_id=hf_config.vision_end_token_id,
-                        spatial_merge_size=hf_config.vision_config.
-                        spatial_merge_size,
+                        second_per_grid_ts=second_per_grid_ts,
                     )
 
             req_ids_to_add.append(req_id)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 1c3feece9..940089310 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -386,20 +386,17 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
                 "mrope embedding type requires multi-modal input mapper "
                 "returns 'image_grid_thw' or 'video_grid_thw'.")
 
+            second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)
             hf_config = self.runner.model_config.hf_config
             token_ids = seq_data.get_token_ids()
 
             mrope_positions, mrope_position_delta = \
                 MRotaryEmbedding.get_input_positions(
                     token_ids,
+                    hf_config=hf_config,
                     image_grid_thw=image_grid_thw,
                     video_grid_thw=video_grid_thw,
-                    image_token_id=hf_config.image_token_id,
-                    video_token_id=hf_config.video_token_id,
-                    vision_start_token_id=hf_config.vision_start_token_id,
-                    vision_end_token_id=hf_config.vision_end_token_id,
-                    spatial_merge_size=hf_config.vision_config.
-                    spatial_merge_size,
+                    second_per_grid_ts=second_per_grid_ts,
                     context_len=computed_len,
                 )
             seq_data.mrope_position_delta = mrope_position_delta
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 0bbba55b3..12baecde6 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -702,6 +702,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
                 "mrope embedding type requires multi-modal input mapper "
                 "returns 'image_grid_thw' or 'video_grid_thw'.")
 
+            second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)
             hf_config = self.runner.model_config.hf_config
 
             inter_data.mrope_input_positions = [None] * inter_data.n_seqs
@@ -713,14 +714,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
                 mrope_input_positions, mrope_position_delta = \
                     MRotaryEmbedding.get_input_positions(
                         token_ids,
+                        hf_config=hf_config,
                         image_grid_thw=image_grid_thw,
                         video_grid_thw=video_grid_thw,
-                        image_token_id=hf_config.image_token_id,
-                        video_token_id=hf_config.video_token_id,
-                        vision_start_token_id=hf_config.vision_start_token_id,
-                        vision_end_token_id=hf_config.vision_end_token_id,
-                        spatial_merge_size=hf_config.vision_config.
-                        spatial_merge_size,
+                        second_per_grid_ts=second_per_grid_ts,
                         context_len=inter_data.context_lens[seq_idx],
                         seq_len=inter_data.seq_lens[seq_idx],
                     )
-- 
GitLab


From 75404d041be0d6e656b59cbbea23520d47d37b66 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 6 Feb 2025 11:09:45 +0800
Subject: [PATCH 57/65] [VLM] Update compatibility with transformers 4.49

---
 docs/source/models/supported_models.md        |  3 +-
 examples/template_pixtral_hf.jinja            | 38 -------------------
 tests/entrypoints/test_chat_utils.py          |  1 -
 .../vision_language/test_models.py            |  4 +-
 .../vision_language/test_llava_next.py        |  7 ++--
 vllm/model_executor/models/llava.py           | 33 +++++++++++-----
 vllm/model_executor/models/llava_next.py      | 10 ++++-
 vllm/model_executor/models/minicpmv.py        |  9 +++++
 vllm/multimodal/inputs.py                     |  4 +-
 9 files changed, 50 insertions(+), 59 deletions(-)
 delete mode 100644 examples/template_pixtral_hf.jinja

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 3e8b2f896..ef7e77fa3 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -883,8 +883,7 @@ For more details, please see: <gh-pr:4087#issuecomment-2250397630>
 :::
 
 :::{note}
-The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)).
-A corrected version is available at <gh-file:examples/template_pixtral_hf.jinja>.
+`mistral-community/pixtral-12b` does not support V1 yet.
 :::
 
 :::{note}
diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja
deleted file mode 100644
index e94661cb3..000000000
--- a/examples/template_pixtral_hf.jinja
+++ /dev/null
@@ -1,38 +0,0 @@
-{%- if messages[0]["role"] == "system" %}
-    {%- set system_message = messages[0]["content"] %}
-    {%- set loop_messages = messages[1:] %}
-{%- else %}
-    {%- set loop_messages = messages %}
-{%- endif %}
-
-{{- bos_token }}
-{%- for message in loop_messages %}
-    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
-        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
-    {%- endif %}
-    {%- if message["role"] == "user" %}
-        {%- if loop.last and system_message is defined %}
-            {{- "[INST]" + system_message + "\n" }}
-        {%- else %}
-            {{- "[INST]" }}
-        {%- endif %}
-        {%- if message["content"] is not string %}
-            {%- for chunk in message["content"] %}
-                {%- if chunk["type"] == "text" %}
-                    {{- chunk["text"] }}
-                {%- elif chunk["type"] == "image" %}
-                    {{- "[IMG]" }}
-                {%- else %}
-                    {{- raise_exception("Unrecognized content type!") }}
-                {%- endif %}
-            {%- endfor %}
-        {%- else %}
-            {{- message["content"] }}
-        {%- endif %}
-        {{- "[/INST]" }}
-    {%- elif message["role"] == "assistant" %}
-        {{- message["content"] + eos_token}}
-    {%- else %}
-        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
-    {%- endif %}
-{%- endfor %}
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 737f73309..5c469007a 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -761,7 +761,6 @@ def test_resolve_content_format_hf_defined(model, expected_format):
      ("template_falcon.jinja", "string"),
      ("template_inkbot.jinja", "string"),
      ("template_llava.jinja", "string"),
-     ("template_pixtral_hf.jinja", "openai"),
      ("template_vlm2vec.jinja", "openai"),
      ("tool_chat_template_granite_20b_fc.jinja", "string"),
      ("tool_chat_template_hermes.jinja", "string"),
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 95505dcf5..b00ec6fa6 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -224,7 +224,7 @@ VLM_TEST_SETTINGS = {
         marks=[
             pytest.mark.skipif(
                 Version(TRANSFORMERS_VERSION) >= Version("4.48"),
-                reason="HF model is not compatible with transformers>=4.48.0",
+                reason="HF model is not compatible with transformers>=4.48",
             )
         ],
     ),
@@ -359,7 +359,7 @@ VLM_TEST_SETTINGS = {
         marks=[
             pytest.mark.skipif(
                 Version(TRANSFORMERS_VERSION) >= Version("4.48"),
-                reason="HF model is not compatible with transformers>=4.48.0",
+                reason="HF model is not compatible with transformers>=4.48",
             )
         ],
     ),
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
index 6ba3c5403..990c6c150 100644
--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -4,7 +4,6 @@ from typing import List, Type
 
 import pytest
 import torch.nn.functional as F
-import transformers
 from transformers import AutoModelForVision2Seq
 
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
@@ -57,6 +56,10 @@ def _run_test(
 
     with hf_runner(model, dtype=dtype,
                    auto_cls=AutoModelForVision2Seq) as hf_model:
+        # Patch the issue where generation_config.json is missing
+        hf_model.processor.patch_size = \
+            hf_model.model.config.vision_config.patch_size
+
         # Patch the issue where image_token_id
         # exceeds the maximum allowed vocab size
         hf_model.model.resize_token_embeddings(
@@ -88,8 +91,6 @@ def _run_test(
     )
 
 
-@pytest.mark.skipif(transformers.__version__ >= "4.46",
-                    reason="Model broken with changes in transformers 4.46")
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 63d308ef6..b1fee3eeb 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -293,16 +293,29 @@ class PixtralHFMultiModalProcessor(
 
         pixel_values = processed_outputs.get("pixel_values")
         if pixel_values is not None:
-            images = mm_data["images"]
-            assert isinstance(images, list)
-
-            # Original output: (1, num_images, C, H, W)
-            # New output: (num_images, C, H, W)
-            assert (isinstance(pixel_values, list) and len(pixel_values) == 1)
-            assert (isinstance(pixel_values[0], list)
-                    and len(pixel_values[0]) == len(images))
-
-            processed_outputs["pixel_values"] = pixel_values[0]
+            # Before/after https://github.com/huggingface/transformers/pull/35122
+            if Version(TRANSFORMERS_VERSION) <= Version("4.48.2"):
+                images = mm_data["images"]
+                assert isinstance(images, list)
+
+                # Original output: (1, num_images, C, H, W)
+                # New output: (num_images, C, H, W)
+                assert (isinstance(pixel_values, list)
+                        and len(pixel_values) == 1)
+                assert (isinstance(pixel_values[0], list)
+                        and len(pixel_values[0]) == len(images))
+
+                processed_outputs["pixel_values"] = pixel_values[0]
+            else:
+                # Avoid padding since we need the output for each image to be
+                # independent of other images for the cache to work correctly
+                image_sizes = processed_outputs["image_sizes"]
+                assert len(pixel_values) == len(image_sizes)
+
+                processed_outputs["pixel_values"] = [
+                    p[:, :h, :w]
+                    for p, (h, w) in zip(pixel_values, image_sizes)
+                ]
 
         return processed_outputs
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index defdeb54a..719916642 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -73,7 +73,15 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
         return self.ctx.get_hf_config(LlavaNextConfig)
 
     def get_hf_processor(self):
-        return self.ctx.get_hf_processor(LlavaNextProcessor)
+        hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor)
+
+        # In case patch_size is omitted from `processor_config.json`
+        # e.g. for E5-V: https://huggingface.co/royokong/e5-v
+        if hf_processor.patch_size is None:
+            patch_size = self.get_vision_encoder_info().get_patch_size()
+            hf_processor.patch_size = patch_size
+
+        return hf_processor
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113
     def get_num_image_tokens(
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 20f3a3d19..58a4448d4 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -342,6 +342,15 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
         **kwargs: object,
     ):
         hf_processor = self.ctx.get_hf_processor()
+
+        # NumPy arrays are considered as Iterable but not Sequence in
+        # https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428
+        image_processor = hf_processor.image_processor  # type: ignore
+        for attr in ("mean", "std"):
+            val = getattr(image_processor, attr)
+            if isinstance(val, np.ndarray):
+                setattr(image_processor, attr, val.tolist())
+
         return hf_processor
 
     def get_image_processor(self):
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 2f2535f36..5f9593ee8 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -141,9 +141,9 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
 def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
     """Equality check between :data:`NestedTensors` objects."""
     if isinstance(a, torch.Tensor):
-        return isinstance(b, torch.Tensor) and bool((a == b).all().item())
+        return isinstance(b, torch.Tensor) and torch.equal(a, b)
     elif isinstance(b, torch.Tensor):
-        return isinstance(a, torch.Tensor) and bool((b == a).all().item())
+        return isinstance(a, torch.Tensor) and torch.equal(b, a)
 
     if isinstance(a, list):
         return (isinstance(b, list)
-- 
GitLab


From 5b19b93082fc5ad0ce33752d8467337cbe93de21 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Wed, 5 Feb 2025 22:15:08 -0500
Subject: [PATCH 58/65] [ROCm][Kernel] Using the correct warp_size value

---
 csrc/moe/moe_align_sum_kernels.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index ff74a42d7..01dac4044 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -207,8 +207,8 @@ __global__ void sgl_moe_align_block_size_kernel(
   __shared__ int32_t shared_counts[32][8];
   __shared__ int32_t local_offsets[256];
 
-  const int warp_id = threadIdx.x / WARP_SIZE;
-  const int lane_id = threadIdx.x % WARP_SIZE;
+  const int warp_id = threadIdx.x / 32;
+  const int lane_id = threadIdx.x % 32;
   const int experts_per_warp = 8;
   const int my_expert_start = warp_id * experts_per_warp;
 
-- 
GitLab


From 76abd0c88143419826bfc13d2cd29669d0fdfa1b Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Wed, 5 Feb 2025 22:22:19 -0500
Subject: [PATCH 59/65] [Bugfix] Better FP8 supported defaults

---
 .../layers/quantization/utils/fp8_utils.py    | 28 +++++++++++--------
 .../layers/quantization/utils/w8a8_utils.py   |  6 +++-
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 10ff71e57..99fbda314 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -15,7 +15,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     _normalize_quant_group_shape, scaled_dequantize)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_fp8_linear)
+    CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -38,7 +38,7 @@ def apply_w8a8_block_fp8_linear(
     weight_scale: torch.Tensor,
     input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
-    cutlass_block_fp8_supported: bool = True,
+    cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
 ) -> torch.Tensor:
     assert input_scale is None
     # View input as 2D matrix for fp8 methods
@@ -85,12 +85,14 @@ def apply_w8a8_block_fp8_linear(
 # `apply_fp8_linear`
 # NOTE(lucas): this is quite messy, we should think through this more formally
 def apply_fp8_linear_generic(
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        input_group_shape: Tuple[int, int],
-        weight_group_shape: Tuple[int, int],
-        input_scale: Optional[torch.Tensor] = None,  # static scale if one
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_group_shape: Tuple[int, int],
+    weight_group_shape: Tuple[int, int],
+    input_scale: Optional[torch.Tensor] = None,  # static scale if one
+    cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED,
+    cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
 ) -> torch.Tensor:
     # View input as 2D matrix for fp8 methods
     input = input.view(-1, input.shape[-1])
@@ -105,14 +107,18 @@ def apply_fp8_linear_generic(
     if is_dim_blocked(0, weight.shape, weight_group_shape[0])\
      and is_dim_blocked(1, weight.shape, weight_group_shape[1]) and\
      input_group_shape == (1, weight_group_shape[1]):
-        return apply_w8a8_block_fp8_linear(input, weight,
-                                           list(weight_group_shape),
-                                           weight_scale)
+        return apply_w8a8_block_fp8_linear(
+            input,
+            weight,
+            list(weight_group_shape),
+            weight_scale,
+            cutlass_block_fp8_supported=cutlass_block_fp8_supported)
     else:
         # Despite having linear in the it doesn't conform to
         # `torch.nn.functional.linear` which is defined as `input @ weight.T`
         # so we explicitly transpose the weight matrix here
         return apply_fp8_linear(input, weight.T, weight_scale.T,
+                    cutlass_fp8_supported=cutlass_fp8_supported,
                          use_per_token_if_dynamic=\
                              (input_group_shape == (1, input.shape[1])))
 
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 3fd88e875..dedeb0c29 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -42,6 +42,10 @@ def cutlass_block_fp8_supported() -> bool:
     return ops.cutlass_scaled_mm_supports_block_fp8(capability)
 
 
+CUTLASS_FP8_SUPPORTED = cutlass_fp8_supported()
+CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
+
+
 def per_tensor_dequantize(
         tensor: torch.Tensor, inv_scale: Union[float,
                                                torch.Tensor]) -> torch.Tensor:
@@ -109,7 +113,7 @@ def apply_fp8_linear(
     input_scale: Optional[torch.Tensor] = None,
     input_scale_ub: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
-    cutlass_fp8_supported: bool = True,
+    cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED,
     use_per_token_if_dynamic: bool = False,
 ) -> torch.Tensor:
     # ops.scaled_fp8_quant supports both dynamic and static quant.
-- 
GitLab


From 9cdea30b4fe0ebd23847371f51ea0a48b9615847 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Wed, 5 Feb 2025 19:23:35 -0800
Subject: [PATCH 60/65] [Misc][Easy] Remove the space from the file name

---
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 vllm/model_executor/layers/fused_moe/fused_moe.py               | 2 +-
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0
 vllm/model_executor/layers/quantization/utils/fp8_utils.py      | 2 +-
 42 files changed, 2 insertions(+), 2 deletions(-)
 rename vllm/model_executor/layers/fused_moe/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/fused_moe/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)
 rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%)

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 1bed35525..f14200e02 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -765,7 +765,7 @@ def get_config_file_name(E: int,
     device_name = current_platform.get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
     block_shape_selector = ("" if not block_shape or not all(block_shape) else
-                            f",block_shape={block_shape}")
+                            f",block_shape={block_shape}").replace(" ", "")
     return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"  # noqa: E501
 
 
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
similarity index 100%
rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 99fbda314..9895537c2 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -423,7 +423,7 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int,
     # First look up if an optimized configuration is available in the configs
     # directory
     device_name = current_platform.get_device_name().replace(" ", "_")
-    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json"  # noqa: E501
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n},{block_k}].json"  # noqa: E501
 
     config_file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
-- 
GitLab


From d88506dda45f69cf1f56c4325282c2a881eaaaf7 Mon Sep 17 00:00:00 2001
From: Sumit Vij <sumitvij11+github@gmail.com>
Date: Wed, 5 Feb 2025 19:54:13 -0800
Subject: [PATCH 61/65] [Model] LoRA Support for Ultravox model (#11253)

---
 docs/source/models/supported_models.md |   2 +-
 tests/conftest.py                      |  16 +++-
 tests/lora/test_ultravox.py            | 121 +++++++++++++++++++++++++
 vllm/model_executor/models/ultravox.py |  28 +++++-
 4 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 tests/lora/test_ultravox.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index ef7e77fa3..32f3e9def 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -857,7 +857,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * Ultravox
   * T + A<sup>E+</sup>
   * `fixie-ai/ultravox-v0_3`
-  *
+  * ✅︎
   * ✅︎
   * ✅︎
 :::
diff --git a/tests/conftest.py b/tests/conftest.py
index 85dd5bcb0..02105900f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -737,6 +737,7 @@ class VllmRunner:
         images: Optional[PromptImageInput] = None,
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        **kwargs: Any,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         inputs = self.get_inputs(prompts,
                                  images=images,
@@ -744,7 +745,8 @@ class VllmRunner:
                                  audios=audios)
 
         req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params)
+                                          sampling_params=sampling_params,
+                                          **kwargs)
 
         outputs: List[Tuple[List[List[int]], List[str]]] = []
         for req_output in req_outputs:
@@ -782,6 +784,7 @@ class VllmRunner:
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
         videos: Optional[PromptVideoInput] = None,
+        **kwargs: Any,
     ) -> Union[List[TokensTextLogprobs],
                List[TokensTextLogprobsPromptLogprobs]]:
         inputs = self.get_inputs(prompts,
@@ -790,7 +793,8 @@ class VllmRunner:
                                  audios=audios)
 
         req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params)
+                                          sampling_params=sampling_params,
+                                          **kwargs)
 
         toks_str_logsprobs_prompt_logprobs = (
             self._final_steps_generate_w_logprobs(req_outputs))
@@ -826,13 +830,15 @@ class VllmRunner:
         images: Optional[PromptImageInput] = None,
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        **kwargs: Any,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts,
                                 greedy_params,
                                 images=images,
                                 videos=videos,
-                                audios=audios)
+                                audios=audios,
+                                **kwargs)
         return [(output_ids[0], output_str[0])
                 for output_ids, output_str in outputs]
 
@@ -847,6 +853,7 @@ class VllmRunner:
         videos: Optional[PromptVideoInput] = None,
         stop_token_ids: Optional[List[int]] = None,
         stop: Optional[List[str]] = None,
+        **kwargs: Any,
     ) -> Union[List[TokensTextLogprobs],
                List[TokensTextLogprobsPromptLogprobs]]:
         greedy_logprobs_params = SamplingParams(
@@ -861,7 +868,8 @@ class VllmRunner:
                                         greedy_logprobs_params,
                                         images=images,
                                         audios=audios,
-                                        videos=videos)
+                                        videos=videos,
+                                        **kwargs)
 
     def generate_encoder_decoder_greedy_logprobs(
         self,
diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py
new file mode 100644
index 000000000..1218dfa34
--- /dev/null
+++ b/tests/lora/test_ultravox.py
@@ -0,0 +1,121 @@
+import shutil
+from os import path
+from tempfile import TemporaryDirectory
+from typing import List, Tuple
+
+import torch
+from huggingface_hub import snapshot_download
+from safetensors.torch import load_file, save_file
+from transformers import AutoTokenizer
+
+from vllm.lora.request import LoRARequest
+
+from ..models.utils import check_outputs_equal
+
+ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
+LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
+
+VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
+
+PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
+
+
+def llama3_1_8b_chess_lora_path():
+    return snapshot_download(
+        repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")
+
+
+# can't use llama lora adapter without module name transformation
+# because ultravox nest language model
+def transform_module_names_for_ultravox(state_dict):
+    transformed_state_dict = {}
+    for key, value in state_dict.items():
+        new_key = key.replace("base_model.model",
+                              "base_model.model.language_model")
+        transformed_state_dict[new_key] = value
+    return transformed_state_dict
+
+
+def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
+    tensor_file = "adapter_model.safetensors"
+    state_dict = load_file(path.join(source_repo, tensor_file))
+    transformed_state_dict = transform_module_names_for_ultravox(state_dict)
+
+    save_file(transformed_state_dict, path.join(target_path, tensor_file))
+
+    config_file = "adapter_config.json"
+    shutil.copyfile(path.join(source_repo, config_file),
+                    path.join(target_path, config_file))
+    return target_path
+
+
+def _get_prompt(audio_count, question, placeholder, model_name) -> str:
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    placeholder = f"{placeholder}\n" * audio_count
+
+    return tokenizer.apply_chat_template([{
+        'role': 'user',
+        'content': f"{placeholder}{question}"
+    }],
+                                         tokenize=False,
+                                         add_generation_prompt=True)
+
+
+def test_ultravox_lora(vllm_runner):
+    """
+    TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
+    """
+    # Workaround to prevent device mismatch in Whisper.
+    # Can be removed when it is fixed upstream in transformer
+    # https://github.com/huggingface/transformers/pull/35866
+    torch.set_default_device("cpu")
+
+    llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()
+    with TemporaryDirectory() as temp_ultravox_lora_dir:
+        llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
+            llama3_1_8b_chess_lora, temp_ultravox_lora_dir)
+        with vllm_runner(
+                ULTRAVOX_MODEL_NAME,
+                enforce_eager=True,
+                max_num_seqs=2,
+                enable_lora=True,
+                max_loras=1,
+                max_lora_rank=128,
+                dtype="bfloat16",
+                max_model_len=1024,
+        ) as vllm_model:
+            ultravox_outputs: List[Tuple[
+                List[int], str]] = vllm_model.generate_greedy(
+                    [
+                        _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
+                                    ULTRAVOX_MODEL_NAME)
+                    ],
+                    256,
+                    lora_request=LoRARequest(str(1), 1,
+                                             llama3_1_8b_ultravox_chess_lora),
+                )
+
+    # run llama with and without lora to compare outputs with above
+    with vllm_runner(
+            LLMA_MODEL_NAME,
+            enforce_eager=True,
+            max_num_seqs=2,
+            enable_lora=True,
+            max_loras=1,
+            max_lora_rank=128,
+            dtype="bfloat16",
+            max_model_len=1024,
+    ) as vllm_model:
+        llama_outputs: List[Tuple[List[int], str]] = (
+            vllm_model.generate_greedy(
+                [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
+                256,
+                lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
+            ))
+
+    check_outputs_equal(
+        outputs_0_lst=ultravox_outputs,
+        outputs_1_lst=llama_outputs,
+        name_0="ultravox",
+        name_1="llama",
+    )
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 52a4d798f..9da0682cf 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
@@ -33,7 +34,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
-from .interfaces import SupportsMultiModal, SupportsPP
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings,
@@ -343,7 +344,20 @@ class ModifiedWhisperEncoder(WhisperEncoder):
     UltravoxMultiModalProcessor,
     info=UltravoxProcessingInfo,
     dummy_inputs=UltravoxDummyInputsBuilder)
-class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
+class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
+    # LoRA specific attributes
+    # TODO : Add LoRA to the audio tower and projector.
+    supported_lora_modules = [
+        "qkv_proj", "o_proj", "gate_up_proj", "down_proj"
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
@@ -391,6 +405,16 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
         return get_sampler()
 
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model.",
+            connector="multi_modal_projector.",
+            tower_model="audio_tower.",
+        )
+
     def _audio_features_to_embeddings(
             self, input_features: torch.Tensor) -> torch.Tensor:
         audio_input = input_features.to(self.audio_tower.dtype)
-- 
GitLab


From 56534cd577211c563b2c5b74098929b949fc4063 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Wed, 5 Feb 2025 21:25:54 -0800
Subject: [PATCH 62/65] [Bugfix] Fix the test_ultravox.py's license (#12806)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 tests/lora/test_ultravox.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py
index 1218dfa34..703f92ce8 100644
--- a/tests/lora/test_ultravox.py
+++ b/tests/lora/test_ultravox.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import shutil
 from os import path
 from tempfile import TemporaryDirectory
-- 
GitLab


From 1a6fcad4c933c89b4060a37d807a7f9e5a680cf3 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 6 Feb 2025 06:24:57 +0000
Subject: [PATCH 63/65] Improve `TransformersModel` UX (#12785)

---
 vllm/model_executor/models/transformers.py | 53 +++++++++++++---------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index dfc714382..43d2c88d3 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 """Wrapper around `transformers` models"""
 import re
-from typing import Iterable, Optional, Union
+from typing import Iterable, Literal, Optional, Union
 
 import torch
 from torch import nn
@@ -72,15 +72,24 @@ def vllm_flash_attention_forward(
 ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward
 
 
+def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
+    logger.debug("%s: %s -> %s", name, old_module, new_module)
+
+
 def replace_linear_class(
         linear: nn.Linear,
-        style: str,
+        style: Literal["colwise", "rowwise"],
         quant_config=None) -> Union[ColumnParallelLinear, RowParallelLinear]:
     """
-    In model configurations, we use a neutral type (string) to specify parallel
-    styles, here we use it to translate nn.Linear into vllm-style tp Linear.
-
-    Quant config is not supported yet
+    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
+    
+    `quant_config` is not yet supported.
+    Args:
+        linear (nn.Linear): `nn.Linear` to be replaced.
+        style (str): Tensor parallel style of the new linear, e.g. "colwise".
+        quant_config (QuantConfig): Quantization config for the new linear.
+    Returns:
+        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
     """
 
     if not isinstance(style, str):
@@ -93,7 +102,10 @@ def replace_linear_class(
     }.get(style)
 
     if vllm_linear_cls is None:
-        raise ValueError(f"Unsupported parallel style value: {style}")
+        logger.warning(
+            "Unsupported parallel style value: %s. "
+            "This layer will not be tensor parallelized.", style)
+        return linear
 
     class HFCompatibleLinear(vllm_linear_cls):
         """
@@ -119,25 +131,24 @@ class TransformersModel(nn.Module):
         super().__init__()
         logger.info("Using Transformers backend.")
 
-        self.vllm_config = vllm_config
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.quant_config = quant_config
+
         self.config = config
+        self.quant_config = quant_config
         self.vocab_size = config.vocab_size
         self.unpadded_vocab_size = config.vocab_size
 
         self.model: PreTrainedModel = AutoModel.from_config(
             self.config,
             attn_implementation="vllm",
-            torch_dtype=vllm_config.model_config.dtype,
             trust_remote_code=vllm_config.model_config.trust_remote_code,
         )
         prefix = self.model.base_model_prefix
 
         # MLP modifications
-        self.tensor_parallelize(self.model)
+        self.apply_base_model_tp_plan(self.model)
 
         # Attention modifications (assumes 1 attention op per hidden layer)
         tp_size = get_tensor_model_parallel_world_size()
@@ -170,13 +181,13 @@ class TransformersModel(nn.Module):
                                                 config.vocab_size, logit_scale)
         self.sampler = get_sampler()
 
-    def log_replacement(self, name: str, old_module: nn.Module,
-                        new_module: nn.Module):
-        logger.debug("%s: %s -> %s", name, old_module, new_module)
-
-    def tensor_parallelize(self, module: nn.Module, prefix: str = ""):
+    def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""):
+        """
+        Apply the base model tensor parallelization plan to a module.
+        Currently only supports linear layers.
+        """
         if (self.config.base_model_tp_plan is None
-                and self.vllm_config.parallel_config.tensor_parallel_size > 1):
+                and get_tensor_model_parallel_world_size() > 1):
             raise ValueError(
                 "Trying to run tensor parallelization but the model does not "
                 "support it yet!")
@@ -189,9 +200,9 @@ class TransformersModel(nn.Module):
                     new_module = replace_linear_class(child_module, style,
                                                       self.quant_config)
                     setattr(module, child_name, new_module)
-                    self.log_replacement(qual_name, child_module, new_module)
+                    log_replacement(qual_name, child_module, new_module)
             else:
-                self.tensor_parallelize(child_module, prefix=qual_name)
+                self.apply_base_model_tp_plan(child_module, prefix=qual_name)
 
     def replace_vocab_embed_class(self, module: nn.Module):
         # Use native set input embeddings
@@ -201,8 +212,8 @@ class TransformersModel(nn.Module):
             org_num_embeddings=self.config.vocab_size,
             quant_config=None,
         )
-        self.log_replacement("input embedding",
-                             self.model.get_input_embeddings(), new_module)
+        log_replacement("input embedding", self.model.get_input_embeddings(),
+                        new_module)
         self.model.set_input_embeddings(new_module)
 
     def forward(
-- 
GitLab


From 449d1bce029f87e0d1cf3f30483687ff659268f2 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Thu, 6 Feb 2025 02:16:20 -0500
Subject: [PATCH 64/65] [Misc] Remove duplicated DeepSeek V2/V3 model
 definition (#12793)

---
 vllm/config.py                            |   1 -
 vllm/model_executor/models/deepseek_v2.py |  48 +-
 vllm/model_executor/models/deepseek_v3.py | 806 ----------------------
 vllm/model_executor/models/registry.py    |   2 +-
 4 files changed, 36 insertions(+), 821 deletions(-)
 delete mode 100644 vllm/model_executor/models/deepseek_v3.py

diff --git a/vllm/config.py b/vllm/config.py
index bc4bf627b..9ba497576 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -754,7 +754,6 @@ class ModelConfig:
 
     @property
     def is_deepseek_mla(self) -> bool:
-        # TODO add deepseek_v3
         return (hasattr(self.hf_text_config, "model_type")) \
                 and (self.hf_text_config.model_type in \
                     ('deepseek_v2', 'deepseek_v3'))\
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index fdd584f9d..773f5abe7 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -21,7 +21,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only DeepseekV2 model."""
+"""Inference-only DeepseekV2/DeepseekV3 model."""
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import torch
@@ -115,23 +115,32 @@ class DeepseekV2MoE(nn.Module):
             raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                              "Only silu is supported for now.")
 
-        self.experts = FusedMoE(num_experts=config.n_routed_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config,
-                                use_grouped_topk=True,
-                                num_expert_group=config.n_group,
-                                topk_group=config.topk_group,
-                                prefix=f"{prefix}.experts")
-
         self.gate = ReplicatedLinear(config.hidden_size,
                                      config.n_routed_experts,
                                      bias=False,
                                      quant_config=None,
                                      prefix=f"{prefix}.gate")
+        if config.topk_method == "noaux_tc":
+            self.gate.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.n_routed_experts))
+        else:
+            self.gate.e_score_correction_bias = None
+
+        self.experts = FusedMoE(
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func=config.scoring_func,
+            e_score_correction_bias=self.gate.e_score_correction_bias)
+
         if config.n_shared_experts is not None:
             intermediate_size = (config.moe_intermediate_size *
                                  config.n_shared_experts)
@@ -732,6 +741,15 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+
+            # TODO(simon): support nextn predict layers
+            if hasattr(self.config, "num_nextn_predict_layers"
+                       ) and self.config.num_nextn_predict_layers > 0:
+                assert self.config.num_nextn_predict_layers == 1
+                layer_idx = self.config.num_hidden_layers
+                if name.startswith(f"model.layers.{layer_idx}"):
+                    continue
+
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
@@ -793,3 +811,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
+
+
+class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
+    pass
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
deleted file mode 100644
index 81f82b182..000000000
--- a/vllm/model_executor/models/deepseek_v3.py
+++ /dev/null
@@ -1,806 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only DeepseekV3 model."""
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
-
-import torch
-from torch import nn
-from transformers import PretrainedConfig
-
-from vllm.attention import Attention, AttentionMetadata
-from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, ModelConfig, VllmConfig
-from vllm.distributed import (get_pp_group,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
-
-from .interfaces import SupportsPP
-from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
-
-
-class DeepseekV3MLP(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
-        reduce_results: bool = True,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
-
-
-class DeepseekV3MoE(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
-        super().__init__()
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.routed_scaling_factor = config.routed_scaling_factor
-        self.n_shared_experts = config.n_shared_experts
-        self.routed_scaling_factor = config.routed_scaling_factor
-        if self.tp_size > config.n_routed_experts:
-            raise ValueError(
-                f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.n_routed_experts}.")
-
-        if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.n_routed_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
-        if config.topk_method == "noaux_tc":
-            self.gate.e_score_correction_bias = nn.Parameter(
-                torch.empty(config.n_routed_experts))
-        else:
-            self.gate.e_score_correction_bias = None
-
-        self.experts = FusedMoE(
-            num_experts=config.n_routed_experts,
-            top_k=config.num_experts_per_tok,
-            hidden_size=config.hidden_size,
-            intermediate_size=config.moe_intermediate_size,
-            reduce_results=False,
-            renormalize=config.norm_topk_prob,
-            quant_config=quant_config,
-            use_grouped_topk=True,
-            num_expert_group=config.n_group,
-            topk_group=config.topk_group,
-            prefix=f"{prefix}.experts",
-            scoring_func=config.scoring_func,
-            e_score_correction_bias=self.gate.e_score_correction_bias)
-
-        if config.n_shared_experts is not None:
-            intermediate_size = (config.moe_intermediate_size *
-                                 config.n_shared_experts)
-            self.shared_experts = DeepseekV3MLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                reduce_results=False,
-            )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        num_tokens, hidden_dim = hidden_states.shape
-        hidden_states = hidden_states.view(-1, hidden_dim)
-        if self.n_shared_experts is not None:
-            shared_output = self.shared_experts(hidden_states)
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(
-            hidden_states=hidden_states,
-            router_logits=router_logits) * self.routed_scaling_factor
-        if shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
-        if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
-
-        return final_hidden_states.view(num_tokens, hidden_dim)
-
-
-def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
-    import math
-    if scale <= 1:
-        return 1.0
-    return 0.1 * mscale * math.log(scale) + 1.0
-
-
-class DeepseekV3Attention(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        hidden_size: int,
-        num_heads: int,
-        qk_nope_head_dim: int,
-        qk_rope_head_dim: int,
-        v_head_dim: int,
-        q_lora_rank: int,
-        kv_lora_rank: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.q_lora_rank = q_lora_rank
-        self.kv_lora_rank = kv_lora_rank
-        self.num_heads = num_heads
-        tp_size = get_tensor_model_parallel_world_size()
-        assert num_heads % tp_size == 0
-        self.num_local_heads = num_heads // tp_size
-        self.scaling = self.qk_head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        if self.q_lora_rank is not None:
-            self.q_a_proj = ReplicatedLinear(self.hidden_size,
-                                             self.q_lora_rank,
-                                             bias=False,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.q_a_proj")
-            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
-                                         eps=config.rms_norm_eps)
-            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
-                                                 self.num_heads *
-                                                 self.qk_head_dim,
-                                                 bias=False,
-                                                 quant_config=quant_config,
-                                                 prefix=f"{prefix}.q_b_proj")
-        else:
-            self.q_proj = ColumnParallelLinear(self.hidden_size,
-                                               self.num_heads *
-                                               self.qk_head_dim,
-                                               bias=False,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.q_proj")
-
-        self.kv_a_proj_with_mqa = ReplicatedLinear(
-            self.hidden_size,
-            self.kv_lora_rank + self.qk_rope_head_dim,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.kv_a_proj_with_mqa")
-        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
-                                      eps=config.rms_norm_eps)
-        self.kv_b_proj = ColumnParallelLinear(
-            self.kv_lora_rank,
-            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.kv_b_proj")
-        # O projection.
-        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
-                                        self.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
-        if rope_scaling:
-            rope_scaling["rope_type"] = 'deepseek_yarn'
-            self.use_normal_rope = False
-        else:
-            self.use_normal_rope = True
-        self.rotary_emb = get_rope(qk_rope_head_dim,
-                                   rotary_dim=qk_rope_head_dim,
-                                   max_position=max_position_embeddings,
-                                   base=rope_theta,
-                                   rope_scaling=rope_scaling,
-                                   is_neox_style=False)
-
-        if rope_scaling:
-            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
-            scaling_factor = rope_scaling["factor"]
-            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
-            self.scaling = self.scaling * mscale * mscale
-
-        self.attn = Attention(self.num_local_heads,
-                              self.qk_head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_local_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
-    ) -> torch.Tensor:
-        if self.q_lora_rank is not None:
-            q = self.q_a_proj(hidden_states)[0]
-            q = self.q_a_layernorm(q)
-            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads,
-                                         self.qk_head_dim)
-        else:
-            q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads,
-                                                   self.qk_head_dim)
-        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
-                               dim=-1)
-        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
-        kv_a, _ = latent_cache.split(
-            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
-        latent_cache = latent_cache.unsqueeze(1)
-        kv_a = self.kv_a_layernorm(kv_a.contiguous())
-        kv = self.kv_b_proj(kv_a)[0]
-        kv = kv.view(-1, self.num_local_heads,
-                     self.qk_nope_head_dim + self.v_head_dim)
-        k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-        k_pe = latent_cache[:, :, self.kv_lora_rank:]
-
-        if self.use_normal_rope:
-            seq_len = positions.size(0)
-            ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape
-            q_pe = q_pe.reshape(seq_len, -1)
-            k_pe = k_pe.reshape(seq_len, -1)
-
-        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
-
-        if self.use_normal_rope:
-            q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape)
-
-        q[..., self.qk_nope_head_dim:] = q_pe
-        k = torch.empty_like(q)
-        k[..., :self.qk_nope_head_dim] = k_nope
-        k[..., self.qk_nope_head_dim:] = k_pe
-        # padding value to qk_head_dim for alignment
-        v = torch.nn.functional.pad(
-            v, [0, self.qk_head_dim - self.v_head_dim],
-            value=0).view(-1, self.num_local_heads * self.qk_head_dim)
-        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
-        attn_output = attn_output.view(
-            -1, self.num_local_heads,
-            self.qk_head_dim)[..., :self.v_head_dim].reshape(
-                -1, self.num_local_heads * self.v_head_dim)
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
-class DeepseekV3MLAAttention(nn.Module):
-    """
-    Main reference: DeepseekV2 paper, and FlashInfer Implementation
-    (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
-    
-    For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        hidden_size: int,
-        num_heads: int,
-        qk_nope_head_dim: int,
-        qk_rope_head_dim: int,
-        v_head_dim: int,
-        q_lora_rank: Optional[int],
-        kv_lora_rank: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-
-        self.q_lora_rank = q_lora_rank
-        self.kv_lora_rank = kv_lora_rank
-
-        self.num_heads = num_heads
-        tp_size = get_tensor_model_parallel_world_size()
-        assert num_heads % tp_size == 0
-        self.num_local_heads = num_heads // tp_size
-
-        self.scaling = self.qk_head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        if self.q_lora_rank is not None:
-            self.q_a_proj = ReplicatedLinear(self.hidden_size,
-                                             self.q_lora_rank,
-                                             bias=False,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.q_a_proj")
-            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
-                                         eps=config.rms_norm_eps)
-            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
-                                                 self.num_heads *
-                                                 self.qk_head_dim,
-                                                 bias=False,
-                                                 quant_config=quant_config,
-                                                 prefix=f"{prefix}.q_b_proj")
-        else:
-            self.q_proj = ColumnParallelLinear(self.hidden_size,
-                                               self.num_heads *
-                                               self.qk_head_dim,
-                                               bias=False,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.q_proj")
-
-        self.kv_a_proj_with_mqa = ReplicatedLinear(
-            self.hidden_size,
-            self.kv_lora_rank + self.qk_rope_head_dim,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.kv_a_proj_with_mqa")
-        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
-                                      eps=config.rms_norm_eps)
-        self.kv_b_proj = ColumnParallelLinear(
-            self.kv_lora_rank,
-            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.kv_b_proj")
-        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
-                                        self.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
-
-        if rope_scaling:
-            rope_scaling["rope_type"] = 'deepseek_yarn'
-        self.rotary_emb = get_rope(qk_rope_head_dim,
-                                   rotary_dim=qk_rope_head_dim,
-                                   max_position=max_position_embeddings,
-                                   base=rope_theta,
-                                   rope_scaling=rope_scaling,
-                                   is_neox_style=False)
-        if rope_scaling:
-            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
-            scaling_factor = rope_scaling["factor"]
-            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
-            self.scaling = self.scaling * mscale * mscale
-
-        self.mla_attn = Attention(
-            num_heads=self.num_local_heads,
-            head_size=self.kv_lora_rank,
-            scale=self.scaling,
-            num_kv_heads=1,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.attn",
-            use_mla=True,
-            # MLA Args
-            q_lora_rank=self.q_lora_rank,
-            kv_lora_rank=self.kv_lora_rank,
-            qk_nope_head_dim=self.qk_nope_head_dim,
-            qk_rope_head_dim=self.qk_rope_head_dim,
-            qk_head_dim=self.qk_head_dim,
-            v_head_dim=self.v_head_dim,
-            rotary_emb=self.rotary_emb,
-            q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
-            kv_b_proj=self.kv_b_proj,
-            o_proj=self.o_proj,
-        )
-
-        self.prefix = prefix
-        self.debug_layer_idx = int(self.prefix.split(".")[-2])
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
-    ) -> torch.Tensor:
-        if self.q_lora_rank is not None:
-            ckq = self.q_a_proj(hidden_states)[0]
-            hidden_states_or_q_c = self.q_a_layernorm(ckq)
-        else:
-            hidden_states_or_q_c = hidden_states
-        kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
-            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
-        kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
-        return self.mla_attn(hidden_states_or_q_c, kv_c_normed, k_pe, kv_cache,
-                             attn_metadata)
-
-
-class DeepseekV3DecoderLayer(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        prefix: str,
-        model_config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        # DecoderLayers are created with `make_layers` which passes the prefix
-        # with the layer's index.
-        layer_idx = int(prefix.split(sep='.')[-1])
-        if model_config.use_mla:
-            attn_cls = DeepseekV3MLAAttention
-        else:
-            attn_cls = DeepseekV3Attention
-        self.self_attn = attn_cls(
-            config=config,
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            qk_nope_head_dim=config.qk_nope_head_dim,
-            qk_rope_head_dim=config.qk_rope_head_dim,
-            v_head_dim=config.v_head_dim,
-            q_lora_rank=config.q_lora_rank
-            if hasattr(config, "q_lora_rank") else None,
-            kv_lora_rank=config.kv_lora_rank,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.self_attn",
-        )
-        if (config.n_routed_experts is not None
-                and layer_idx >= config.first_k_dense_replace
-                and layer_idx % config.moe_layer_freq == 0):
-            self.mlp = DeepseekV3MoE(
-                config=config,
-                quant_config=quant_config,
-                prefix=f"{prefix}.mlp",
-            )
-        else:
-            self.mlp = DeepseekV3MLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=config.intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                prefix=f"{prefix}.mlp",
-            )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
-        residual: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            attn_metadata=attn_metadata,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
-        return hidden_states, residual
-
-
-@support_torch_compile
-class DeepseekV3Model(nn.Module):
-
-    fall_back_to_pt_during_load = False
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-
-        config = vllm_config.model_config.hf_config
-        model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
-
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        if get_pp_group().is_first_rank:
-            self.embed_tokens = VocabParallelEmbedding(
-                config.vocab_size,
-                config.hidden_size,
-            )
-        else:
-            self.embed_tokens = PPMissingLayer()
-
-        self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers,
-            lambda prefix: DeepseekV3DecoderLayer(
-                config,
-                prefix,
-                model_config=model_config,
-                cache_config=cache_config,
-                quant_config=quant_config,
-            ),
-            prefix=f"{prefix}.layers")
-
-        if get_pp_group().is_last_rank:
-            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        else:
-            self.norm = PPMissingLayer()
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        if get_pp_group().is_first_rank:
-            if inputs_embeds is not None:
-                hidden_states = inputs_embeds
-            else:
-                hidden_states = self.get_input_embeddings(input_ids)
-            residual = None
-        else:
-            assert intermediate_tensors is not None
-            hidden_states = intermediate_tensors["hidden_states"]
-            residual = intermediate_tensors["residual"]
-
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.layers[i]
-            hidden_states, residual = layer(positions, hidden_states,
-                                            kv_caches[i - self.start_layer],
-                                            attn_metadata, residual)
-
-        if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
-
-        hidden_states, _ = self.norm(hidden_states, residual)
-        return hidden_states
-
-
-class DeepseekV3ForCausalLM(nn.Module, SupportsPP):
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        self.config = config
-        self.quant_config = quant_config
-        self.model = DeepseekV3Model(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config)
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, intermediate_tensors,
-                                   inputs_embeds)
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
-
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
-    def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-            "residual":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
-
-    def load_weights(self, weights: Iterable[Tuple[str,
-                                                   torch.Tensor]]) -> Set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-
-        # Params for weights, fp8 weight scales, fp8 activation scales
-        # (param_name, weight_name, expert_id, shard_id)
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
-            ckpt_gate_proj_name="gate_proj",
-            ckpt_down_proj_name="down_proj",
-            ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
-
-        params_dict = dict(self.named_parameters())
-        loaded_params: Set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            # TODO(simon): support nextn predict layers
-            if hasattr(self.config, "num_nextn_predict_layers"
-                       ) and self.config.num_nextn_predict_layers > 0:
-                assert self.config.num_nextn_predict_layers == 1
-                layer_idx = self.config.num_hidden_layers
-                if name.startswith(f"model.layers.{layer_idx}"):
-                    continue
-
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                # Skip non-stacked layers and experts (experts handled below).
-                if weight_name not in name:
-                    continue
-                # We have mlp.experts[0].gate_proj in the checkpoint.
-                # Since we handle the experts below in expert_params_mapping,
-                # we need to skip here BEFORE we update the name, otherwise
-                # name will be updated to mlp.experts[0].gate_up_proj, which
-                # will then be updated below in expert_params_mapping
-                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                for mapping in expert_params_mapping:
-                    param_name, weight_name, expert_id, shard_id = mapping
-                    if weight_name not in name:
-                        continue
-                    name = name.replace(weight_name, param_name)
-
-                    if is_pp_missing_parameter(name, self):
-                        continue
-
-                    param = params_dict[name]
-                    weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
-                    break
-                else:
-                    # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
-                        continue
-
-                    if is_pp_missing_parameter(name, self):
-                        continue
-
-                    param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
-                    weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index b6708f77d..3b2a7069e 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -45,7 +45,7 @@ _TEXT_GENERATION_MODELS = {
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
     "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
-    "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"),
+    "DeepseekV3ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),
     "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
     "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"),
-- 
GitLab


From 0408efc6d0c17fba17b2be38d0d0f02e96d2bf9d Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 6 Feb 2025 15:23:50 +0800
Subject: [PATCH 65/65] [Misc] Improve error message for incorrect pynvml
 (#12809)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/platforms/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 9c98942b5..e4767a378 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -41,7 +41,11 @@ def cuda_platform_plugin() -> Optional[str]:
                 is_cuda = True
         finally:
             pynvml.nvmlShutdown()
-    except Exception:
+    except Exception as e:
+        if "nvml" not in e.__class__.__name__.lower():
+            # If the error is not related to NVML, re-raise it.
+            raise e
+
         # CUDA is supported on Jetson, but NVML may not be.
         import os
 
-- 
GitLab