From 3194039c0ee82685af434c9f8023304b4a45124b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 1 Feb 2025 11:16:19 -0500 Subject: [PATCH 01/65] Apply torch.compile to fused_moe/grouped_topk (#12637) --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + vllm/model_executor/models/deepseek_v3.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index c966be99e..c80e6bf07 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -759,6 +759,7 @@ def fused_topk( # This is used by the Deepseek-V2 and Deepseek-V3 model +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def grouped_topk(hidden_states: torch.Tensor, gating_output: torch.Tensor, topk: int, diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py index f6ab53c85..06ea3dab9 100644 --- a/vllm/model_executor/models/deepseek_v3.py +++ b/vllm/model_executor/models/deepseek_v3.py @@ -27,6 +27,7 @@ from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, @@ -566,8 +567,7 @@ class DeepseekV3DecoderLayer(nn.Module): return hidden_states, residual -# TODO(simon): check whether we support torch compile for Deepseek V3 -# @support_torch_compile +@support_torch_compile class DeepseekV3Model(nn.Module): fall_back_to_pt_during_load = False -- GitLab From b4e5c03306ebdf58bce503c73b4f3dc5592df114 Mon Sep 17 00:00:00 2001 From: Vicente Herrera Date: Sat, 1 Feb 2025 18:17:29 +0100 Subject: [PATCH 02/65] doc: fixing minor typo in readme.md (#12643) Word "evolved" was mistyped Signed-off-by: Vicente 
Herrera --------- Signed-off-by: Vicente Herrera --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5fd30f2b1..80c3ba7d1 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone ## About vLLM is a fast and easy-to-use library for LLM inference and serving. -Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry. +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. vLLM is fast with: -- GitLab From baaa2b24da86d63965dbffc34c97c7c4b50288db Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 2 Feb 2025 15:29:56 +0800 Subject: [PATCH 03/65] [Bugfix] fix moe_wna16 get_quant_method (#12648) Fix https://github.com/vllm-project/vllm/issues/12647 The `get_quant_method` of `moe_wna16` always return moe method, GPTQ-based linear method or AWQ-based linear method, even when the target module is attention layer. 
https://github.com/vllm-project/vllm/blob/baeded25699f9f4851843306f27f685c4d4ee7c5/vllm/attention/layer.py#L86-L92 Signed-off-by: Jinzhen Lin --- .../layers/quantization/moe_wna16.py | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 8cd9c0a7e..11a9d4ac5 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -6,16 +6,13 @@ from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import UnquantizedLinearMethod -from vllm.model_executor.layers.quantization.awq import (AWQConfig, - AWQLinearMethod) -from vllm.model_executor.layers.quantization.awq_marlin import ( - AWQMarlinConfig, AWQMarlinLinearMethod) +from vllm.model_executor.layers.quantization.awq import AWQConfig +from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) -from vllm.model_executor.layers.quantization.gptq import (GPTQConfig, - GPTQLinearMethod) +from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinConfig, GPTQMarlinLinearMethod) + GPTQMarlinConfig) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -131,18 +128,18 @@ class MoeWNA16Config(QuantizationConfig): else: if self.linear_quant_method == "gptq": if self.use_marlin: - return GPTQMarlinLinearMethod( - GPTQMarlinConfig.from_config(self.full_config)) + return GPTQMarlinConfig.from_config( + self.full_config).get_quant_method(layer, prefix) else: - return GPTQLinearMethod( - 
GPTQConfig.from_config(self.full_config)) + return GPTQConfig.from_config( + self.full_config).get_quant_method(layer, prefix) elif self.linear_quant_method == "awq": if self.use_marlin: - return AWQMarlinLinearMethod( - AWQMarlinConfig.from_config(self.full_config)) + return AWQMarlinConfig.from_config( + self.full_config).get_quant_method(layer, prefix) else: - return AWQLinearMethod( - AWQConfig.from_config(self.full_config)) + return AWQConfig.from_config( + self.full_config).get_quant_method(layer, prefix) else: raise ValueError("moe_wna16 only support gptq and awq.") -- GitLab From e497f33491671abbf94a3e563d55ca2818ee09db Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sun, 2 Feb 2025 02:35:50 -0500 Subject: [PATCH 04/65] [Core] Silence unnecessary deprecation warnings (#12620) I noticed during testing that I was getting a lot of these deprecation warnings about `local_lora_path`: ``` DeprecationWarning: The 'lora_local_path' attribute is deprecated and will be removed in a future version. Please use 'lora_path' instead. ``` The check used for emitting this warning was always True, even when the parameter was not actually specified. It will always be in `__struct_fields__`. We should be checking for a non-None value, instead. Signed-off-by: Russell Bryant Signed-off-by: Russell Bryant --- vllm/lora/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index c4b26dc92..5e3d2f0ed 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -31,7 +31,7 @@ class LoRARequest( base_model_name: Optional[str] = msgspec.field(default=None) def __post_init__(self): - if 'lora_local_path' in self.__struct_fields__: + if self.lora_local_path: warnings.warn( "The 'lora_local_path' attribute is deprecated " "and will be removed in a future version. 
" -- GitLab From abfcdcdf27eb54d2a2104b4bf5091a24ea4ff928 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 1 Feb 2025 23:43:20 -0800 Subject: [PATCH 05/65] [V1][Minor] Avoid frequently creating ConstantList (#12653) A small optimization to avoid creating a new `ConstantList` every time `request.kv_block_hashes` is used. Signed-off-by: Woosuk Kwon --- vllm/v1/request.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 2cfcd8b63..80160c673 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -64,6 +64,7 @@ class Request: # Cache the computed kv block hashes of the request to avoid # recomputing. self._kv_block_hashes: List[BlockHashType] = [] + self.kv_block_hashes = ConstantList(self._kv_block_hashes) # Read-only views # Prevent directly appending to the these lists since @@ -121,13 +122,9 @@ class Request: num_tokens = self.mm_positions[input_id]["length"] return num_tokens - @property - def kv_block_hashes(self) -> ConstantList["BlockHashType"]: - # Prevent directly appending to the kv_block_hashes. - return ConstantList(self._kv_block_hashes) - def set_kv_block_hashes(self, value: List["BlockHashType"]) -> None: self._kv_block_hashes = value + self.kv_block_hashes = ConstantList(self._kv_block_hashes) def append_kv_block_hashes(self, block_hash: "BlockHashType") -> None: self._kv_block_hashes.append(block_hash) -- GitLab From f8ece6e17fbf4ff3a98d6d53cb3a03c50c02828c Mon Sep 17 00:00:00 2001 From: Shawn Du Date: Sun, 2 Feb 2025 16:40:58 +0800 Subject: [PATCH 06/65] [Core][v1] Unify allocating slots in prefill and decode in KV cache manager (#12608) As mentioned in RFC https://github.com/vllm-project/vllm/issues/12254, this PR achieves the task: combine allocate_slots and append_slots. There should be no functionality change, except that in decode, also raise exception when num_tokens is zero (like prefill), and change the unit test case accordingly. 
@comaniac @rickyyx @WoosukKwon @youkaichao @heheda12345 @simon-mo --------- Signed-off-by: Shawn Du --- tests/v1/core/test_prefix_caching.py | 24 ++-- vllm/v1/core/kv_cache_manager.py | 168 ++++++++++----------------- vllm/v1/core/scheduler.py | 2 +- 3 files changed, 78 insertions(+), 116 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index f434fa8c6..5c1cda285 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -164,7 +164,7 @@ def test_decode(): req0.num_computed_tokens = 55 for _ in range(4): req0.append_output_token_ids(8) - new_blocks = manager.append_slots(req0, 4) + new_blocks = manager.allocate_slots(req0, 4) assert new_blocks is not None and len(new_blocks) == 0 assert manager.req_to_blocks[req0.request_id][-2].block_hash is None @@ -175,7 +175,7 @@ def test_decode(): # the preallocated block. for _ in range(5 + 10): req0.append_output_token_ids(7) - new_blocks = manager.append_slots(req0, 15) + new_blocks = manager.allocate_slots(req0, 15) assert new_blocks is not None and len(new_blocks) == 0 assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None @@ -185,7 +185,7 @@ def test_decode(): # the preallocated block. for _ in range(6 + 11): req0.append_output_token_ids(12) - new_blocks = manager.append_slots(req0, 17) + new_blocks = manager.allocate_slots(req0, 17) # Plus one preallocated block. assert new_blocks is not None and len(new_blocks) == 2 @@ -395,12 +395,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): req.num_computed_tokens = block_size assert len(blocks) == 1 + num_preallocated_blocks - # Assume all computed. - manager.append_slots(req, block_size * (len(blocks) - 1)) - req.num_computed_tokens = block_size * len(blocks) + # Assume all computed, only when num_preallocate_tokens > 0, we need to + # consume the previously preallocated blocks. 
+ if num_preallocated_blocks > 0: + manager.allocate_slots(req, block_size * (len(blocks) - 1)) + req.num_computed_tokens = block_size * len(blocks) # Append 1 block. - blocks = manager.append_slots(req, block_size) + blocks = manager.allocate_slots(req, block_size) assert len(blocks) == 1 + num_preallocated_blocks @@ -503,7 +505,7 @@ def test_mm_prefix_caching(): # Append slots without allocating a new block. for _ in range(5): req0.append_output_token_ids(8) - new_blocks = manager.append_slots(req0, 5) + new_blocks = manager.allocate_slots(req0, 5) assert new_blocks is not None and len(new_blocks) == 0 # The just completed block should have hashes with extra keys. @@ -603,7 +605,7 @@ def test_reset_prefix_cache(): unique_token_ids = [3] * 7 all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) - blocks = manager.allocate_slots(req0, 55, []) + blocks = manager.allocate_slots(req0, 55) assert [b.block_id for b in blocks] == [0, 1, 2, 3] unique_token_ids = [4] * 7 @@ -639,7 +641,7 @@ def test_uncache_blocks(): ) req0 = make_request("0", list(range(30))) - blocks = manager.allocate_slots(req0, 30, []) + blocks = manager.allocate_slots(req0, 30) assert [b.block_id for b in blocks] == [0, 1] assert len(manager.cached_block_hash_to_block) == 1 @@ -648,7 +650,7 @@ def test_uncache_blocks(): # Simulate speculative tokens. for _ in range(5): req0.append_output_token_ids(8) - manager.append_slots(req0, 5) + manager.allocate_slots(req0, 5) assert len(manager.cached_block_hash_to_block) == 2 # After sampling, assuming only 1 token is accepted. 
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index d6c612f15..7176ec954 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, Iterable, List, Optional, Tuple +from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple from vllm.logger import init_logger from vllm.utils import cdiv @@ -67,7 +67,8 @@ class KVCacheManager: # Mapping from request ID to blocks to track the blocks allocated # for each request, so that we can free the blocks when the request # is finished. - self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} + self.req_to_blocks: DefaultDict[str, + List[KVCacheBlock]] = defaultdict(list) @property def usage(self) -> float: @@ -115,33 +116,75 @@ class KVCacheManager: num_computed_tokens = len(computed_blocks) * self.block_size return computed_blocks, num_computed_tokens - def append_slots( + def allocate_slots( self, request: Request, num_tokens: int, + new_computed_blocks: Optional[List[KVCacheBlock]] = None ) -> Optional[List[KVCacheBlock]]: - """Append slots to the block table of the request. - We first append slots to already allocated blocks. If the allocated - blocks are not enough, we allocate new blocks. + """Add slots for a request with new tokens to append. Args: - request: The request to append slots. - num_tokens: The number of tokens to append. + request: The request to allocate slots. + num_tokens: The number of tokens to allocate. Note that this does + not include the tokens that have already been computed. + new_computed_blocks: A list of new computed blocks just hitting the + prefix caching. 
+ + Blocks layout: + ----------------------------------------------------------------------- + | < computed > | < new computed > | < new > | < pre-allocated > | + ----------------------------------------------------------------------- + | < required > | + -------------------------------------------------- + | < full > | + ------------------------------------------------ + | | + -------------- + The following *_blocks are illustrated in this layout. Returns: - A list of new blocks if new blocks are allocated, or None - if new blocks are required but cannot be allocated. + A list of new allocated blocks. """ - num_required_blocks = cdiv(request.num_computed_tokens + num_tokens, + if num_tokens == 0: + raise ValueError("num_tokens must be greater than 0") + + new_computed_blocks = new_computed_blocks or [] + + # The number of computed tokens is the number of computed tokens plus + # the new prefix caching hits + num_computed_tokens = (request.num_computed_tokens + + len(new_computed_blocks) * self.block_size) + num_required_blocks = cdiv(num_computed_tokens + num_tokens, self.block_size) req_blocks = self.req_to_blocks[request.request_id] + num_new_blocks = (num_required_blocks - len(req_blocks) - + len(new_computed_blocks)) - num_new_blocks = num_required_blocks - len(req_blocks) - if num_new_blocks > self.free_block_queue.num_free_blocks: - # Need to allocate new blocks due to insufficient pre-allocated - # slots, but we cannot allocate new blocks due to the limit. + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. + num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks + if blk.ref_cnt == 0) + if (num_new_blocks > self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks): + # Cannot allocate new blocks return None + # Touch the computed blocks to make sure they won't be evicted. 
+ if self.enable_caching: + self._touch(new_computed_blocks) + else: + assert not new_computed_blocks, ( + "Computed blocks should be empty when " + "prefix caching is disabled") + + # Append the new computed blocks to the request blocks until now to + # avoid the case where the new blocks cannot be allocated. + req_blocks.extend(new_computed_blocks) + + # Start to handle new blocks + if num_new_blocks <= 0: # No new block is needed. new_blocks = [] @@ -160,112 +203,29 @@ class KVCacheManager: ) assert num_new_blocks > 0 + # Concatenate the computed block IDs and the new block IDs. new_blocks = self._get_new_blocks(num_new_blocks) req_blocks.extend(new_blocks) if not self.enable_caching: return new_blocks - num_computed_full_blocks = (request.num_computed_tokens // - self.block_size) - # NOTE(rickyx): We are assuming the `num_tokens` are actual # tokens rather than lookahead slots (e.g. for speculative decoding). # TODO(rickyx): When supporting speculative decoding, we will need to # differentiate between them so that we can know how many blocks are # full after appending the actual tokens. - num_full_blocks_after_append = (request.num_computed_tokens + - num_tokens) // self.block_size - assert num_full_blocks_after_append <= len(req_blocks) - - new_full_blocks = req_blocks[ - num_computed_full_blocks:num_full_blocks_after_append] - if new_full_blocks: - self._cache_full_blocks( - request=request, - blk_start_idx=num_computed_full_blocks, - full_blocks=new_full_blocks, - prev_block=req_blocks[num_computed_full_blocks - 1] - if num_computed_full_blocks >= 1 else None, - ) - - return new_blocks - - def allocate_slots( - self, - request: Request, - num_tokens: int, - computed_blocks: List[KVCacheBlock], - ) -> Optional[List[KVCacheBlock]]: - """Allocate slots for a new request. - - Args: - request: The request to allocate slots. - num_tokens: The number of tokens to allocate. Note that this does - not include the tokens that have already been computed. 
- computed_blocks: A list of computed blocks. - - Returns: - A list of new allocated blocks. - """ - if num_tokens == 0: - raise ValueError( - f"num_tokens must be greater than 0, got {num_tokens}") - - # If a computed block of a request is an eviction candidate (in the - # free queue and ref_cnt == 0), it cannot be counted as a free block - # when allocating this request. - num_evictable_computed_blocks = sum(1 for blk in computed_blocks - if blk.ref_cnt == 0) - - num_required_blocks = cdiv(num_tokens, self.block_size) - if (num_required_blocks > self.free_block_queue.num_free_blocks - - num_evictable_computed_blocks): - # Cannot allocate new blocks. - return None - - # Touch the computed blocks to make sure they won't be evicted. - if self.enable_caching: - self._touch(computed_blocks) - else: - assert not computed_blocks, ( - "Computed blocks should be empty when " - "prefix caching is disabled") - - # Determine the number of new blocks to allocate considering - # preallocated blocks. - num_new_blocks = min( - num_required_blocks + self.num_preallocate_blocks, - self.free_block_queue.num_free_blocks, - # Should not exceed the maximum number of blocks per request. - # This is especially because the block table has the shape - # [..., max_num_blocks_per_req]. - # TODO(woosuk): Check and reject requests if - # num_prompt_tokens + max_tokens > max_model_len. - self.max_num_blocks_per_req - len(computed_blocks), - ) - assert num_new_blocks > 0 - - # Concatenate the computed block IDs and the new block IDs. 
- new_blocks = self._get_new_blocks(num_new_blocks) - self.req_to_blocks[request.request_id] = computed_blocks + new_blocks - - if not self.enable_caching: - return new_blocks - - num_computed_tokens = len(computed_blocks) * self.block_size num_full_blocks = (num_computed_tokens + num_tokens) // self.block_size - - new_full_blocks = self.req_to_blocks[ - request.request_id][len(computed_blocks):num_full_blocks] + num_computed_full_blocks = num_computed_tokens // self.block_size + new_full_blocks = req_blocks[num_computed_full_blocks:num_full_blocks] if new_full_blocks: self._cache_full_blocks( request=request, - blk_start_idx=len(computed_blocks), + blk_start_idx=num_computed_full_blocks, # The new full blocks are the full blocks that are not computed. full_blocks=new_full_blocks, - prev_block=computed_blocks[-1] if computed_blocks else None, - ) + prev_block=(req_blocks[num_computed_full_blocks - 1] + if num_computed_full_blocks > 0 else None)) return new_blocks diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 910fc4ff4..27c9ac1ae 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -138,7 +138,7 @@ class Scheduler: assert num_new_tokens > 0 while True: - new_blocks = self.kv_cache_manager.append_slots( + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens) if new_blocks is None: # The request cannot be scheduled. 
-- GitLab From f256ebe4df6757d76f1f1642d7e110268a2f8190 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sun, 2 Feb 2025 18:17:26 +0800 Subject: [PATCH 07/65] [Hardware][Intel GPU] add XPU bf16 support (#12392) Signed-off-by: Kunshang Ji --- .../installation/gpu/xpu.inc.md | 2 +- vllm/platforms/xpu.py | 23 ++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 411682678..ef02d9a07 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -36,7 +36,7 @@ VLLM_TARGET_DEVICE=xpu python setup.py install :::{note} - FP16 is the default data type in the current XPU backend. The BF16 data - type will be supported in the future. + type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. ::: ## Set up using Docker diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index a5ca77f57..039cdd5ad 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -66,9 +66,14 @@ class XPUPlatform(Platform): # check and update model config model_config = vllm_config.model_config if model_config.dtype == torch.bfloat16: - logger.warning( - "bfloat16 is not fully supported on XPU, casting to float16.") - model_config.dtype = torch.float16 + bf16_supported = cls.device_support_bf16() + if not bf16_supported: + logger.warning( + "bfloat16 is only supported on Intel Data Center GPU, " + "Intel Arc GPU is not supported yet. Your device is %s," + "which is not supported. 
will fallback to float16", + cls.get_device_name()) + model_config.dtype = torch.float16 if not model_config.enforce_eager: logger.warning( "CUDA graph is not supported on XPU, fallback to the eager " @@ -116,3 +121,15 @@ class XPUPlatform(Platform): ) -> float: torch.xpu.reset_peak_memory_stats(device) return torch.xpu.max_memory_allocated(device) + + @classmethod + def device_support_bf16(cls) -> bool: + device_name = cls.get_device_name().lower() + if device_name.count("arc") > 0: + return False + elif device_name.count("data center gpu") > 0: + return True + else: + logger.warning("Unknown device name %s, always use float16", + device_name) + return False -- GitLab From e489ad7a210f4234db696d1f2749d5f3662fa65b Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sun, 2 Feb 2025 14:58:18 -0500 Subject: [PATCH 08/65] [Misc] Add SPDX-License-Identifier headers to python source files (#12628) - **Add SPDX license headers to python source files** - **Check for SPDX headers using pre-commit** commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745 Author: Russell Bryant Date: Fri Jan 31 14:18:24 2025 -0500 Add SPDX license headers to python source files This commit adds SPDX license headers to python source files as recommended to the project by the Linux Foundation. These headers provide a concise way that is both human and machine readable for communicating license information for each source file. It helps avoid any ambiguity about the license of the code and can also be easily used by tools to help manage license compliance. The Linux Foundation runs license scans against the codebase to help ensure we are in compliance with the licenses of the code we use, including dependencies. Having these headers in place helps that tool do its job. 
More information can be found on the SPDX site: - https://spdx.dev/learn/handling-license-info/ Signed-off-by: Russell Bryant commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea Author: Russell Bryant Date: Fri Jan 31 14:36:32 2025 -0500 Check for SPDX headers using pre-commit Signed-off-by: Russell Bryant --------- Signed-off-by: Russell Bryant --- .buildkite/check-wheel-size.py | 2 + .buildkite/generate_index.py | 2 + .../test_lm_eval_correctness.py | 1 + .../convert-results-json-to-markdown.py | 2 + .../scripts/download-tokenizer.py | 2 + .../scripts/generate-nightly-markdown.py | 2 + .../scripts/get-lmdeploy-modelname.py | 2 + .../scripts/summary-nightly-results.py | 2 + .pre-commit-config.yaml | 6 ++- benchmarks/backend_request_func.py | 2 + benchmarks/benchmark_guided.py | 1 + benchmarks/benchmark_latency.py | 1 + .../benchmark_long_document_qa_throughput.py | 1 + benchmarks/benchmark_prefix_caching.py | 1 + benchmarks/benchmark_prioritization.py | 1 + benchmarks/benchmark_serving.py | 1 + benchmarks/benchmark_serving_guided.py | 1 + benchmarks/benchmark_throughput.py | 1 + .../cutlass_benchmarks/sparse_benchmarks.py | 2 + benchmarks/cutlass_benchmarks/utils.py | 2 + .../cutlass_benchmarks/w8a8_benchmarks.py | 2 + .../cutlass_benchmarks/weight_shapes.py | 2 + .../disagg_prefill_proxy_server.py | 2 + .../disagg_benchmarks/round_robin_proxy.py | 2 + .../visualize_benchmark_results.py | 2 + .../fused_kernels/layernorm_rms_benchmarks.py | 2 + benchmarks/kernels/benchmark_aqlm.py | 2 + benchmarks/kernels/benchmark_layernorm.py | 2 + benchmarks/kernels/benchmark_lora.py | 2 + benchmarks/kernels/benchmark_machete.py | 2 + benchmarks/kernels/benchmark_marlin.py | 2 + benchmarks/kernels/benchmark_moe.py | 2 + .../kernels/benchmark_paged_attention.py | 2 + benchmarks/kernels/benchmark_quant.py | 2 + benchmarks/kernels/benchmark_rmsnorm.py | 2 + benchmarks/kernels/benchmark_rope.py | 2 + benchmarks/kernels/benchmark_shapes.py | 2 + benchmarks/kernels/graph_machete_bench.py 
| 2 + benchmarks/kernels/utils.py | 2 + benchmarks/kernels/weight_shapes.py | 2 + benchmarks/overheads/benchmark_hashing.py | 2 + cmake/hipify.py | 2 + collect_env.py | 2 + .../vllm_cutlass_library_extension.py | 2 + csrc/quantization/machete/generate.py | 2 + docs/source/conf.py | 2 + docs/source/generate_examples.py | 2 + examples/offline_inference/aqlm_example.py | 2 + examples/offline_inference/arctic.py | 2 + examples/offline_inference/audio_language.py | 1 + examples/offline_inference/basic.py | 2 + .../basic_with_model_default_sampling.py | 2 + examples/offline_inference/chat.py | 2 + examples/offline_inference/chat_with_tools.py | 2 + examples/offline_inference/classification.py | 2 + examples/offline_inference/cli.py | 2 + examples/offline_inference/cpu_offload.py | 2 + examples/offline_inference/distributed.py | 1 + examples/offline_inference/embedding.py | 2 + examples/offline_inference/encoder_decoder.py | 1 + .../offline_inference/florence2_inference.py | 1 + examples/offline_inference/gguf_inference.py | 2 + .../offline_inference/llm_engine_example.py | 2 + .../lora_with_quantization_inference.py | 1 + examples/offline_inference/mlpspeculator.py | 2 + .../offline_inference/multilora_inference.py | 1 + examples/offline_inference/neuron.py | 2 + .../neuron_int8_quantization.py | 2 + examples/offline_inference/pixtral.py | 2 + examples/offline_inference/prefix_caching.py | 2 + examples/offline_inference/profiling.py | 2 + .../profiling_tpu/profiling.py | 2 + examples/offline_inference/rlhf.py | 1 + .../offline_inference/save_sharded_state.py | 1 + examples/offline_inference/scoring.py | 2 + .../offline_inference/simple_profiling.py | 2 + .../offline_inference/structured_outputs.py | 2 + .../offline_inference/torchrun_example.py | 1 + examples/offline_inference/tpu.py | 2 + examples/offline_inference/vision_language.py | 1 + .../vision_language_embedding.py | 1 + .../vision_language_multi_image.py | 1 + examples/offline_inference/whisper.py | 2 + 
examples/online_serving/api_client.py | 1 + .../online_serving/cohere_rerank_client.py | 1 + .../gradio_openai_chatbot_webserver.py | 2 + examples/online_serving/gradio_webserver.py | 2 + .../online_serving/jinaai_rerank_client.py | 1 + .../openai_chat_completion_client.py | 2 + ...i_chat_completion_client_for_multimodal.py | 1 + ...penai_chat_completion_client_with_tools.py | 1 + ...enai_chat_completion_structured_outputs.py | 2 + .../openai_chat_completion_with_reasoning.py | 1 + ...hat_completion_with_reasoning_streaming.py | 1 + ...ai_chat_embedding_client_for_multimodal.py | 2 + .../openai_completion_client.py | 2 + .../openai_cross_encoder_score.py | 1 + .../online_serving/openai_embedding_client.py | 2 + .../online_serving/openai_pooling_client.py | 1 + .../opentelemetry/dummy_client.py | 2 + examples/other/tensorize_vllm_model.py | 2 + find_cuda_init.py | 2 + python_only_dev.py | 2 + setup.py | 2 + tests/async_engine/api_server_async_engine.py | 1 + tests/async_engine/test_api_server.py | 2 + tests/async_engine/test_async_llm_engine.py | 2 + tests/async_engine/test_request_tracker.py | 2 + .../test_basic_correctness.py | 1 + .../basic_correctness/test_chunked_prefill.py | 1 + tests/basic_correctness/test_cpu_offload.py | 2 + tests/basic_correctness/test_cumem.py | 2 + tests/basic_correctness/test_preemption.py | 1 + tests/compile/backend.py | 2 + tests/compile/piecewise/test_simple.py | 1 + tests/compile/piecewise/test_toy_llama.py | 1 + tests/compile/test_basic_correctness.py | 2 + tests/compile/test_full_graph.py | 2 + tests/compile/test_functionalization.py | 2 + tests/compile/test_fusion.py | 2 + tests/compile/test_pass_manager.py | 2 + tests/compile/test_wrapper.py | 2 + tests/compile/utils.py | 2 + tests/conftest.py | 2 + tests/core/block/conftest.py | 2 + tests/core/block/e2e/conftest.py | 2 + tests/core/block/e2e/test_correctness.py | 2 + .../e2e/test_correctness_sliding_window.py | 2 + tests/core/block/test_block_manager.py | 2 + 
tests/core/block/test_block_table.py | 2 + tests/core/block/test_common.py | 2 + .../block/test_cpu_gpu_block_allocator.py | 2 + tests/core/block/test_naive_block.py | 2 + tests/core/block/test_prefix_caching_block.py | 2 + tests/core/test_chunked_prefill_scheduler.py | 2 + tests/core/test_num_computed_tokens_update.py | 2 + tests/core/test_scheduler.py | 2 + tests/core/test_scheduler_encoder_decoder.py | 2 + tests/core/test_serialization.py | 2 + tests/core/utils.py | 2 + tests/distributed/test_ca_buffer_sharing.py | 2 + tests/distributed/test_comm_ops.py | 1 + tests/distributed/test_custom_all_reduce.py | 2 + tests/distributed/test_distributed_oot.py | 2 + .../distributed/test_multi_node_assignment.py | 1 + tests/distributed/test_pipeline_parallel.py | 1 + tests/distributed/test_pipeline_partition.py | 2 + tests/distributed/test_pp_cudagraph.py | 2 + tests/distributed/test_pynccl.py | 2 + tests/distributed/test_same_node.py | 2 + tests/distributed/test_shm_broadcast.py | 2 + tests/distributed/test_torchrun_example.py | 2 + tests/distributed/test_utils.py | 2 + tests/encoder_decoder/test_e2e_correctness.py | 1 + .../output_processor/test_multi_step.py | 2 + .../output_processor/test_stop_checker.py | 2 + tests/engine/test_arg_utils.py | 2 + tests/engine/test_computed_prefix_blocks.py | 2 + tests/engine/test_custom_executor.py | 2 + tests/engine/test_detokenization.py | 2 + tests/engine/test_multiproc_workers.py | 2 + tests/engine/test_short_mm_context.py | 2 + tests/engine/test_skip_tokenizer_init.py | 2 + tests/engine/test_stop_reason.py | 1 + tests/engine/test_stop_strings.py | 2 + tests/entrypoints/conftest.py | 2 + tests/entrypoints/llm/test_accuracy.py | 1 + tests/entrypoints/llm/test_chat.py | 2 + tests/entrypoints/llm/test_collective_rpc.py | 2 + tests/entrypoints/llm/test_encode.py | 2 + tests/entrypoints/llm/test_generate.py | 2 + .../llm/test_generate_multiple_loras.py | 2 + tests/entrypoints/llm/test_gpu_utilization.py | 2 + 
tests/entrypoints/llm/test_guided_generate.py | 2 + tests/entrypoints/llm/test_init.py | 2 + tests/entrypoints/llm/test_lazy_outlines.py | 2 + .../entrypoints/llm/test_prompt_validation.py | 2 + .../offline_mode/test_offline_mode.py | 1 + .../test_deepseekr1_reasoning_parser.py | 2 + .../openai/reasoning_parsers/utils.py | 2 + tests/entrypoints/openai/test_accuracy.py | 1 + .../openai/test_async_tokenization.py | 2 + tests/entrypoints/openai/test_audio.py | 2 + tests/entrypoints/openai/test_basic.py | 2 + tests/entrypoints/openai/test_chat.py | 2 + tests/entrypoints/openai/test_chat_echo.py | 2 + .../entrypoints/openai/test_chat_template.py | 2 + .../entrypoints/openai/test_chunked_prompt.py | 2 + tests/entrypoints/openai/test_cli_args.py | 2 + tests/entrypoints/openai/test_completion.py | 2 + tests/entrypoints/openai/test_embedding.py | 2 + .../openai/test_encoder_decoder.py | 2 + .../entrypoints/openai/test_lora_adapters.py | 2 + tests/entrypoints/openai/test_metrics.py | 2 + tests/entrypoints/openai/test_models.py | 2 + .../openai/test_oot_registration.py | 2 + tests/entrypoints/openai/test_pooling.py | 2 + .../openai/test_prompt_validation.py | 2 + tests/entrypoints/openai/test_rerank.py | 2 + .../openai/test_return_tokens_as_ids.py | 2 + tests/entrypoints/openai/test_root_path.py | 2 + tests/entrypoints/openai/test_run_batch.py | 2 + tests/entrypoints/openai/test_score.py | 2 + tests/entrypoints/openai/test_serving_chat.py | 2 + .../entrypoints/openai/test_serving_models.py | 2 + tests/entrypoints/openai/test_shutdown.py | 2 + tests/entrypoints/openai/test_tokenization.py | 2 + tests/entrypoints/openai/test_video.py | 2 + tests/entrypoints/openai/test_vision.py | 2 + .../openai/test_vision_embedding.py | 2 + .../tool_parsers/test_pythonic_tool_parser.py | 2 + .../entrypoints/openai/tool_parsers/utils.py | 2 + tests/entrypoints/test_chat_utils.py | 2 + tests/kernels/allclose_default.py | 2 + tests/kernels/conftest.py | 2 + tests/kernels/quant_utils.py | 2 + 
tests/kernels/test_activation.py | 2 + tests/kernels/test_aqlm.py | 2 + tests/kernels/test_attention.py | 2 + tests/kernels/test_attention_selector.py | 2 + tests/kernels/test_awq.py | 2 + tests/kernels/test_awq_marlin.py | 1 + tests/kernels/test_awq_triton.py | 1 + tests/kernels/test_block_fp8.py | 2 + tests/kernels/test_blocksparse_attention.py | 2 + tests/kernels/test_cache.py | 2 + tests/kernels/test_cascade_flash_attn.py | 2 + tests/kernels/test_causal_conv1d.py | 2 + tests/kernels/test_cutlass.py | 1 + tests/kernels/test_cutlass_2of4_sparse.py | 1 + tests/kernels/test_encoder_decoder_attn.py | 1 + tests/kernels/test_flash_attn.py | 2 + tests/kernels/test_flashinfer.py | 2 + tests/kernels/test_fp8_quant.py | 2 + tests/kernels/test_fused_quant_layernorm.py | 2 + tests/kernels/test_ggml.py | 2 + tests/kernels/test_gguf.py | 2 + tests/kernels/test_gptq.py | 2 + tests/kernels/test_int8_quant.py | 2 + tests/kernels/test_layernorm.py | 2 + tests/kernels/test_machete_mm.py | 1 + tests/kernels/test_mamba_ssm.py | 2 + tests/kernels/test_marlin_gemm.py | 1 + tests/kernels/test_mha_attn.py | 1 + tests/kernels/test_moe.py | 1 + tests/kernels/test_permute_cols.py | 2 + tests/kernels/test_pos_encoding.py | 2 + tests/kernels/test_prefix_prefill.py | 2 + tests/kernels/test_rotary_embedding.py | 1 + tests/kernels/test_triton_decode_attention.py | 2 + tests/kernels/test_triton_scaled_mm.py | 1 + tests/kernels/test_utils.py | 1 + tests/kernels/utils.py | 1 + tests/kv_transfer/disagg_test.py | 2 + tests/kv_transfer/module_test.py | 2 + tests/kv_transfer/test_lookup_buffer.py | 2 + tests/kv_transfer/test_send_recv.py | 2 + tests/lora/conftest.py | 2 + tests/lora/data/long_context_test_data.py | 2 + tests/lora/test_baichuan.py | 2 + tests/lora/test_chatglm3_tp.py | 2 + tests/lora/test_gemma.py | 2 + tests/lora/test_jamba.py | 2 + tests/lora/test_layers.py | 2 + tests/lora/test_llama_tp.py | 2 + tests/lora/test_long_context.py | 2 + tests/lora/test_lora_bias_e2e.py | 2 + 
tests/lora/test_lora_checkpoints.py | 2 + tests/lora/test_lora_huggingface.py | 2 + tests/lora/test_lora_manager.py | 2 + tests/lora/test_minicpmv_tp.py | 2 + tests/lora/test_mixtral.py | 2 + tests/lora/test_peft_helper.py | 2 + tests/lora/test_phi.py | 2 + tests/lora/test_punica_ops_sizes.py | 1 + tests/lora/test_punica_ops_variation.py | 1 + tests/lora/test_quant_model.py | 2 + tests/lora/test_qwen2vl.py | 2 + tests/lora/test_tokenizer_group.py | 2 + tests/lora/test_utils.py | 2 + tests/lora/test_worker.py | 2 + tests/lora/utils.py | 2 + tests/metrics/test_metrics.py | 2 + tests/model_executor/conftest.py | 2 + .../model_executor/test_enabled_custom_ops.py | 2 + .../model_executor/test_guided_processors.py | 2 + .../test_model_load_with_params.py | 2 + tests/model_executor/weight_utils.py | 2 + .../audio_language/test_ultravox.py | 2 + .../models/decoder_only/language/test_aqlm.py | 1 + .../models/decoder_only/language/test_fp8.py | 2 + .../models/decoder_only/language/test_gguf.py | 1 + .../decoder_only/language/test_gptq_marlin.py | 1 + .../language/test_gptq_marlin_24.py | 1 + .../decoder_only/language/test_granite.py | 1 + .../decoder_only/language/test_jamba.py | 2 + .../decoder_only/language/test_mamba.py | 1 + .../decoder_only/language/test_mistral.py | 1 + .../decoder_only/language/test_modelopt.py | 2 + .../decoder_only/language/test_models.py | 1 + .../decoder_only/language/test_phimoe.py | 1 + .../decoder_only/vision_language/test_awq.py | 2 + .../vision_language/test_h2ovl.py | 2 + .../vision_language/test_intern_vit.py | 2 + .../vision_language/test_models.py | 1 + .../vision_language/test_phi3v.py | 2 + .../vision_language/test_pixtral.py | 1 + .../vision_language/test_qwen2_vl.py | 2 + .../vision_language/vlm_utils/builders.py | 1 + .../vlm_utils/case_filtering.py | 1 + .../vision_language/vlm_utils/core.py | 3 +- .../vlm_utils/custom_inputs.py | 1 + .../vision_language/vlm_utils/model_utils.py | 1 + .../vision_language/vlm_utils/runners.py | 1 + 
.../vision_language/vlm_utils/types.py | 1 + .../embedding/language/test_cls_models.py | 1 + .../embedding/language/test_embedding.py | 1 + .../models/embedding/language/test_gritlm.py | 2 + .../models/embedding/language/test_scoring.py | 1 + tests/models/embedding/utils.py | 2 + .../vision_language/test_dse_qwen2_vl.py | 2 + .../vision_language/test_llava_next.py | 2 + .../embedding/vision_language/test_phi3v.py | 2 + .../audio_language/test_whisper.py | 1 + .../encoder_decoder/language/test_bart.py | 1 + .../vision_language/test_broadcast.py | 2 + .../vision_language/test_florence2.py | 2 + .../vision_language/test_mllama.py | 2 + .../multimodal/processing/test_common.py | 2 + .../multimodal/processing/test_idefics3.py | 1 + .../multimodal/processing/test_internvl.py | 1 + .../multimodal/processing/test_llava_next.py | 2 + .../processing/test_llava_onevision.py | 2 + .../multimodal/processing/test_phi3v.py | 1 + .../multimodal/processing/test_qwen2_vl.py | 2 + tests/models/registry.py | 2 + tests/models/test_initialization.py | 2 + tests/models/test_oot_registration.py | 2 + tests/models/test_registry.py | 2 + tests/models/utils.py | 2 + tests/mq_llm_engine/test_abort.py | 1 + tests/mq_llm_engine/test_error_handling.py | 1 + tests/mq_llm_engine/test_load.py | 1 + tests/mq_llm_engine/utils.py | 2 + .../multi_step/test_correctness_async_llm.py | 2 + tests/multi_step/test_correctness_llm.py | 2 + tests/multimodal/test_inputs.py | 2 + tests/multimodal/test_processing.py | 2 + tests/multimodal/test_processor_kwargs.py | 2 + tests/multimodal/test_utils.py | 2 + tests/multimodal/utils.py | 2 + tests/neuron/test_prefix_prefill.py | 2 + tests/plugins/vllm_add_dummy_model/setup.py | 2 + .../vllm_add_dummy_model/__init__.py | 2 + .../my_gemma_embedding.py | 2 + .../vllm_add_dummy_model/my_llava.py | 2 + .../vllm_add_dummy_model/my_opt.py | 2 + .../plugins/vllm_add_dummy_platform/setup.py | 2 + .../vllm_add_dummy_platform/__init__.py | 2 + .../dummy_attention_backend.py | 2 
+ .../vllm_add_dummy_platform/dummy_platform.py | 2 + tests/plugins_tests/test_platform_plugins.py | 2 + .../test_disable_sliding_window.py | 1 + tests/prefix_caching/test_prefix_caching.py | 1 + tests/prompt_adapter/test_bloom.py | 2 + .../test_multi_adapter_inference.py | 2 + tests/prompt_adapter/test_pa_lora.py | 2 + tests/quantization/test_bitsandbytes.py | 1 + tests/quantization/test_compressed_tensors.py | 1 + tests/quantization/test_configs.py | 1 + tests/quantization/test_cpu_offload.py | 2 + tests/quantization/test_experts_int8.py | 2 + tests/quantization/test_fp8.py | 1 + tests/quantization/test_ipex_quant.py | 1 + tests/quantization/test_lm_head.py | 1 + tests/quantization/test_quark.py | 1 + .../test_register_quantization_config.py | 1 + tests/quantization/utils.py | 2 + .../test_runai_model_streamer_loader.py | 2 + .../runai_model_streamer/test_weight_utils.py | 2 + tests/samplers/test_beam_search.py | 1 + tests/samplers/test_ignore_eos.py | 1 + tests/samplers/test_logits_processor.py | 2 + tests/samplers/test_logprobs.py | 2 + tests/samplers/test_no_bad_words.py | 1 + tests/samplers/test_ranks.py | 2 + tests/samplers/test_rejection_sampler.py | 1 + tests/samplers/test_sampler.py | 2 + tests/samplers/test_seeded_generate.py | 1 + .../test_typical_acceptance_sampler.py | 1 + tests/spec_decode/e2e/conftest.py | 2 + tests/spec_decode/e2e/test_compatibility.py | 2 + .../spec_decode/e2e/test_eagle_correctness.py | 1 + tests/spec_decode/e2e/test_integration.py | 1 + .../e2e/test_integration_dist_tp2.py | 1 + .../e2e/test_integration_dist_tp4.py | 1 + tests/spec_decode/e2e/test_logprobs.py | 2 + .../e2e/test_medusa_correctness.py | 1 + tests/spec_decode/e2e/test_mlp_correctness.py | 1 + .../e2e/test_multistep_correctness.py | 1 + .../spec_decode/e2e/test_ngram_correctness.py | 1 + tests/spec_decode/e2e/test_seed.py | 2 + tests/spec_decode/test_batch_expansion.py | 2 + tests/spec_decode/test_dynamic_spec_decode.py | 2 + tests/spec_decode/test_metrics.py | 2 + 
tests/spec_decode/test_multi_step_worker.py | 2 + tests/spec_decode/test_ngram_worker.py | 2 + tests/spec_decode/test_scorer.py | 2 + tests/spec_decode/test_spec_decode_worker.py | 2 + tests/spec_decode/test_utils.py | 2 + tests/spec_decode/utils.py | 2 + tests/standalone_tests/lazy_torch_compile.py | 2 + tests/tensorizer_loader/conftest.py | 2 + tests/tensorizer_loader/test_tensorizer.py | 2 + tests/test_cache_block_hashing.py | 1 + tests/test_config.py | 2 + tests/test_embedded_commit.py | 2 + tests/test_inputs.py | 2 + tests/test_logger.py | 2 + tests/test_logits_processor.py | 2 + tests/test_regression.py | 1 + tests/test_sampling_params.py | 1 + tests/test_scalartype.py | 2 + tests/test_sequence.py | 2 + tests/test_sharded_state_loader.py | 2 + tests/test_utils.py | 2 + tests/tokenization/test_cached_tokenizer.py | 2 + tests/tokenization/test_detokenize.py | 2 + tests/tokenization/test_get_eos.py | 1 + tests/tokenization/test_tokenizer.py | 2 + tests/tokenization/test_tokenizer_group.py | 2 + tests/tool_use/conftest.py | 2 + ...est_chat_completion_request_validations.py | 2 + tests/tool_use/test_chat_completions.py | 2 + tests/tool_use/test_jamba_tool_parser.py | 2 + tests/tool_use/test_parallel_tool_calls.py | 2 + tests/tool_use/test_tool_calls.py | 2 + tests/tool_use/utils.py | 2 + tests/tpu/test_compilation.py | 2 + tests/tpu/test_custom_dispatcher.py | 2 + tests/tpu/test_quantization_accuracy.py | 2 + tests/tracing/test_tracing.py | 2 + tests/utils.py | 2 + tests/v1/core/test_kv_cache_utils.py | 2 + tests/v1/core/test_prefix_caching.py | 1 + tests/v1/e2e/test_cascade_attention.py | 2 + tests/v1/engine/test_async_llm.py | 2 + tests/v1/engine/test_engine_args.py | 2 + tests/v1/engine/test_engine_core.py | 2 + tests/v1/engine/test_engine_core_client.py | 2 + tests/v1/engine/test_output_processor.py | 2 + tests/v1/sample/test_sampler.py | 2 + tests/v1/test_stats.py | 2 + tests/v1/test_utils.py | 2 + tests/v1/worker/test_gpu_input_batch.py | 2 + 
tests/vllm_test_utils/setup.py | 2 + .../vllm_test_utils/__init__.py | 1 + .../vllm_test_utils/vllm_test_utils/blame.py | 2 + .../vllm_test_utils/monitor.py | 2 + tests/weight_loading/test_weight_loading.py | 2 + .../test_encoder_decoder_model_runner.py | 2 + tests/worker/test_model_input.py | 2 + tests/worker/test_model_runner.py | 2 + tests/worker/test_profile.py | 2 + tests/worker/test_swap.py | 2 + tools/check_spdx_header.py | 43 +++++++++++++++++++ tools/profiler/print_layerwise_table.py | 2 + tools/profiler/visualize_layerwise_profile.py | 2 + tools/report_build_time_ninja.py | 2 + use_existing_torch.py | 2 + vllm/__init__.py | 1 + vllm/_custom_ops.py | 2 + vllm/_ipex_ops.py | 2 + vllm/adapter_commons/layers.py | 2 + vllm/adapter_commons/models.py | 2 + vllm/adapter_commons/request.py | 2 + vllm/adapter_commons/utils.py | 2 + vllm/adapter_commons/worker_manager.py | 2 + vllm/assets/audio.py | 2 + vllm/assets/base.py | 2 + vllm/assets/image.py | 2 + vllm/assets/video.py | 2 + vllm/attention/__init__.py | 2 + vllm/attention/backends/abstract.py | 2 + vllm/attention/backends/blocksparse_attn.py | 2 + vllm/attention/backends/flash_attn.py | 1 + vllm/attention/backends/flashinfer.py | 2 + vllm/attention/backends/hpu_attn.py | 2 + vllm/attention/backends/ipex_attn.py | 1 + vllm/attention/backends/mla/utils.py | 2 + vllm/attention/backends/openvino.py | 2 + vllm/attention/backends/pallas.py | 2 + vllm/attention/backends/placeholder_attn.py | 2 + vllm/attention/backends/rocm_flash_attn.py | 1 + vllm/attention/backends/torch_sdpa.py | 1 + vllm/attention/backends/triton_mla.py | 2 + vllm/attention/backends/utils.py | 1 + vllm/attention/backends/xformers.py | 1 + vllm/attention/layer.py | 1 + .../blocksparse_attention_kernel.py | 2 + .../ops/blocksparse_attention/interface.py | 2 + .../ops/blocksparse_attention/utils.py | 2 + vllm/attention/ops/hpu_paged_attn.py | 2 + vllm/attention/ops/ipex_attn.py | 2 + vllm/attention/ops/nki_flash_attn.py | 2 + 
vllm/attention/ops/paged_attn.py | 2 + vllm/attention/ops/prefix_prefill.py | 2 + vllm/attention/ops/triton_decode_attention.py | 2 + vllm/attention/ops/triton_flash_attention.py | 2 + vllm/attention/selector.py | 2 + vllm/beam_search.py | 2 + vllm/compilation/backends.py | 2 + vllm/compilation/counter.py | 2 + vllm/compilation/decorators.py | 2 + vllm/compilation/fix_functionalization.py | 2 + vllm/compilation/fusion.py | 2 + vllm/compilation/fx_utils.py | 2 + vllm/compilation/inductor_pass.py | 2 + vllm/compilation/monitor.py | 2 + vllm/compilation/multi_output_match.py | 2 + vllm/compilation/pass_manager.py | 2 + vllm/compilation/reshapes.py | 2 + vllm/compilation/vllm_inductor_pass.py | 2 + vllm/compilation/wrapper.py | 2 + vllm/config.py | 2 + vllm/connections.py | 2 + vllm/core/block/block_table.py | 2 + vllm/core/block/common.py | 2 + vllm/core/block/cpu_gpu_block_allocator.py | 2 + vllm/core/block/interfaces.py | 2 + vllm/core/block/naive_block.py | 2 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block/utils.py | 1 + vllm/core/block_manager.py | 1 + vllm/core/evictor.py | 2 + vllm/core/interfaces.py | 2 + vllm/core/placeholder_block_space_manager.py | 2 + vllm/core/scheduler.py | 2 + vllm/device_allocator/cumem.py | 2 + vllm/distributed/__init__.py | 2 + vllm/distributed/communication_op.py | 2 + .../device_communicators/cuda_wrapper.py | 1 + .../device_communicators/custom_all_reduce.py | 2 + .../custom_all_reduce_utils.py | 2 + .../device_communicators/hpu_communicator.py | 2 + .../device_communicators/pynccl.py | 2 + .../device_communicators/pynccl_wrapper.py | 2 + .../device_communicators/shm_broadcast.py | 2 + .../device_communicators/tpu_communicator.py | 2 + .../device_communicators/xpu_communicator.py | 2 + .../kv_transfer/kv_connector/base.py | 1 + .../kv_transfer/kv_connector/factory.py | 2 + .../kv_connector/simple_connector.py | 1 + .../kv_transfer/kv_lookup_buffer/base.py | 1 + .../kv_lookup_buffer/simple_buffer.py | 1 + 
vllm/distributed/kv_transfer/kv_pipe/base.py | 1 + .../kv_transfer/kv_pipe/mooncake_pipe.py | 2 + .../kv_transfer/kv_pipe/pynccl_pipe.py | 1 + .../kv_transfer/kv_transfer_agent.py | 1 + vllm/distributed/parallel_state.py | 2 + vllm/distributed/utils.py | 2 + vllm/engine/arg_utils.py | 2 + vllm/engine/async_llm_engine.py | 2 + vllm/engine/async_timeout.py | 2 + vllm/engine/llm_engine.py | 2 + vllm/engine/metrics.py | 2 + vllm/engine/metrics_types.py | 1 + vllm/engine/multiprocessing/__init__.py | 2 + vllm/engine/multiprocessing/client.py | 2 + vllm/engine/multiprocessing/engine.py | 2 + vllm/engine/output_processor/interfaces.py | 2 + vllm/engine/output_processor/multi_step.py | 2 + vllm/engine/output_processor/single_step.py | 2 + vllm/engine/output_processor/stop_checker.py | 2 + vllm/engine/output_processor/util.py | 2 + vllm/engine/protocol.py | 2 + vllm/entrypoints/api_server.py | 1 + vllm/entrypoints/chat_utils.py | 2 + vllm/entrypoints/launcher.py | 2 + vllm/entrypoints/llm.py | 2 + vllm/entrypoints/logger.py | 2 + vllm/entrypoints/openai/api_server.py | 2 + vllm/entrypoints/openai/cli_args.py | 1 + vllm/entrypoints/openai/logits_processors.py | 2 + vllm/entrypoints/openai/protocol.py | 2 + .../openai/reasoning_parsers/__init__.py | 2 + .../abs_reasoning_parsers.py | 2 + .../deepseek_r1_reasoning_parser.py | 2 + vllm/entrypoints/openai/run_batch.py | 2 + vllm/entrypoints/openai/serving_chat.py | 2 + vllm/entrypoints/openai/serving_completion.py | 2 + vllm/entrypoints/openai/serving_embedding.py | 2 + vllm/entrypoints/openai/serving_engine.py | 2 + vllm/entrypoints/openai/serving_models.py | 2 + vllm/entrypoints/openai/serving_pooling.py | 2 + vllm/entrypoints/openai/serving_rerank.py | 2 + vllm/entrypoints/openai/serving_score.py | 2 + .../openai/serving_tokenization.py | 2 + .../openai/tool_parsers/__init__.py | 2 + .../tool_parsers/abstract_tool_parser.py | 2 + .../granite_20b_fc_tool_parser.py | 2 + .../tool_parsers/granite_tool_parser.py | 2 + 
.../openai/tool_parsers/hermes_tool_parser.py | 2 + .../tool_parsers/internlm2_tool_parser.py | 2 + .../openai/tool_parsers/jamba_tool_parser.py | 2 + .../openai/tool_parsers/llama_tool_parser.py | 2 + .../tool_parsers/mistral_tool_parser.py | 2 + .../tool_parsers/pythonic_tool_parser.py | 2 + vllm/entrypoints/openai/tool_parsers/utils.py | 2 + vllm/entrypoints/utils.py | 2 + vllm/envs.py | 2 + vllm/executor/executor_base.py | 2 + vllm/executor/mp_distributed_executor.py | 2 + vllm/executor/msgspec_utils.py | 2 + vllm/executor/multiproc_worker_utils.py | 2 + vllm/executor/ray_distributed_executor.py | 2 + vllm/executor/ray_utils.py | 2 + vllm/executor/uniproc_executor.py | 2 + vllm/forward_context.py | 2 + vllm/inputs/__init__.py | 2 + vllm/inputs/data.py | 2 + vllm/inputs/parse.py | 2 + vllm/inputs/preprocess.py | 2 + vllm/inputs/registry.py | 2 + vllm/logger.py | 1 + vllm/logging_utils/__init__.py | 2 + vllm/logging_utils/formatter.py | 2 + vllm/logits_process.py | 2 + vllm/lora/fully_sharded_layers.py | 2 + vllm/lora/layers.py | 2 + vllm/lora/lora.py | 2 + vllm/lora/models.py | 2 + vllm/lora/ops/torch_ops/__init__.py | 2 + vllm/lora/ops/torch_ops/lora_ops.py | 2 + vllm/lora/ops/triton_ops/__init__.py | 2 + vllm/lora/ops/triton_ops/bgmv_expand.py | 1 + vllm/lora/ops/triton_ops/bgmv_expand_slice.py | 1 + vllm/lora/ops/triton_ops/bgmv_shrink.py | 1 + vllm/lora/ops/triton_ops/sgmv_expand.py | 1 + vllm/lora/ops/triton_ops/sgmv_shrink.py | 1 + vllm/lora/ops/triton_ops/utils.py | 2 + vllm/lora/peft_helper.py | 2 + vllm/lora/punica_wrapper/__init__.py | 2 + vllm/lora/punica_wrapper/punica_base.py | 1 + vllm/lora/punica_wrapper/punica_cpu.py | 2 + vllm/lora/punica_wrapper/punica_gpu.py | 1 + vllm/lora/punica_wrapper/punica_hpu.py | 2 + vllm/lora/punica_wrapper/punica_selector.py | 2 + vllm/lora/punica_wrapper/utils.py | 2 + vllm/lora/request.py | 2 + vllm/lora/utils.py | 2 + vllm/lora/worker_manager.py | 2 + vllm/model_executor/__init__.py | 2 + 
vllm/model_executor/custom_op.py | 2 + .../guided_decoding/__init__.py | 2 + .../guided_decoding/guided_fields.py | 2 + .../lm_format_enforcer_decoding.py | 2 + .../guided_decoding/outlines_decoding.py | 2 + .../outlines_logits_processors.py | 2 + vllm/model_executor/guided_decoding/utils.py | 2 + .../guided_decoding/xgrammar_decoding.py | 2 + vllm/model_executor/layers/activation.py | 1 + .../layers/fused_moe/__init__.py | 2 + .../layers/fused_moe/fused_marlin_moe.py | 1 + .../layers/fused_moe/fused_moe.py | 1 + vllm/model_executor/layers/fused_moe/layer.py | 2 + .../layers/fused_moe/moe_pallas.py | 2 + .../layers/fused_moe/moe_torch_iterative.py | 2 + vllm/model_executor/layers/layernorm.py | 1 + vllm/model_executor/layers/linear.py | 2 + .../model_executor/layers/logits_processor.py | 1 + .../layers/mamba/mamba_mixer.py | 2 + .../layers/mamba/ops/causal_conv1d.py | 2 + .../layers/mamba/ops/mamba_ssm.py | 2 + vllm/model_executor/layers/pooler.py | 2 + .../layers/quantization/__init__.py | 2 + .../layers/quantization/aqlm.py | 2 + .../model_executor/layers/quantization/awq.py | 2 + .../layers/quantization/awq_marlin.py | 2 + .../layers/quantization/awq_triton.py | 2 + .../layers/quantization/base_config.py | 2 + .../layers/quantization/bitsandbytes.py | 2 + .../compressed_tensors/compressed_tensors.py | 2 + .../compressed_tensors_moe.py | 2 + .../compressed_tensors/schemes/__init__.py | 2 + .../schemes/compressed_tensors_24.py | 2 + .../schemes/compressed_tensors_scheme.py | 2 + .../schemes/compressed_tensors_w4a16_24.py | 2 + .../schemes/compressed_tensors_w8a16_fp8.py | 2 + .../schemes/compressed_tensors_w8a8_fp8.py | 2 + .../schemes/compressed_tensors_w8a8_int8.py | 2 + .../schemes/compressed_tensors_wNa16.py | 2 + .../compressed_tensors/triton_scaled_mm.py | 2 + .../quantization/compressed_tensors/utils.py | 2 + .../layers/quantization/deepspeedfp.py | 2 + .../layers/quantization/experts_int8.py | 2 + .../layers/quantization/fbgemm_fp8.py | 2 + 
.../model_executor/layers/quantization/fp8.py | 2 + .../layers/quantization/gguf.py | 2 + .../layers/quantization/gptq.py | 2 + .../layers/quantization/gptq_marlin.py | 2 + .../layers/quantization/gptq_marlin_24.py | 2 + .../layers/quantization/hqq_marlin.py | 2 + .../layers/quantization/ipex_quant.py | 2 + .../kernels/mixed_precision/MPLinearKernel.py | 2 + .../kernels/mixed_precision/__init__.py | 2 + .../kernels/mixed_precision/exllama.py | 2 + .../kernels/mixed_precision/machete.py | 2 + .../kernels/mixed_precision/marlin.py | 2 + .../kernels/scaled_mm/ScaledMMLinearKernel.py | 2 + .../kernels/scaled_mm/__init__.py | 2 + .../quantization/kernels/scaled_mm/cutlass.py | 2 + .../quantization/kernels/scaled_mm/triton.py | 2 + .../quantization/kernels/scaled_mm/xla.py | 2 + .../layers/quantization/kv_cache.py | 2 + .../layers/quantization/marlin.py | 2 + .../layers/quantization/modelopt.py | 2 + .../layers/quantization/moe_wna16.py | 2 + .../layers/quantization/neuron_quant.py | 2 + .../model_executor/layers/quantization/qqq.py | 2 + .../layers/quantization/quark/quark.py | 2 + .../layers/quantization/quark/quark_moe.py | 2 + .../quantization/quark/schemes/__init__.py | 2 + .../quark/schemes/quark_scheme.py | 2 + .../quark/schemes/quark_w8a8_fp8.py | 2 + .../quark/schemes/quark_w8a8_int8.py | 2 + .../layers/quantization/quark/utils.py | 2 + .../layers/quantization/schema.py | 1 + .../layers/quantization/tpu_int8.py | 2 + .../layers/quantization/utils/__init__.py | 2 + .../layers/quantization/utils/fp8_utils.py | 2 + .../layers/quantization/utils/layer_utils.py | 2 + .../quantization/utils/machete_utils.py | 2 + .../layers/quantization/utils/marlin_utils.py | 2 + .../quantization/utils/marlin_utils_fp8.py | 2 + .../quantization/utils/marlin_utils_test.py | 1 + .../utils/marlin_utils_test_24.py | 1 + .../utils/marlin_utils_test_qqq.py | 2 + .../layers/quantization/utils/quant_utils.py | 1 + .../layers/quantization/utils/w8a8_utils.py | 2 + 
.../layers/rejection_sampler.py | 2 + vllm/model_executor/layers/resampler.py | 2 + .../model_executor/layers/rotary_embedding.py | 2 + vllm/model_executor/layers/sampler.py | 1 + .../layers/spec_decode_base_sampler.py | 2 + .../layers/typical_acceptance_sampler.py | 2 + vllm/model_executor/layers/utils.py | 1 + .../layers/vocab_parallel_embedding.py | 2 + vllm/model_executor/model_loader/__init__.py | 2 + vllm/model_executor/model_loader/loader.py | 2 + vllm/model_executor/model_loader/neuron.py | 1 + vllm/model_executor/model_loader/openvino.py | 2 + .../model_executor/model_loader/tensorizer.py | 2 + vllm/model_executor/model_loader/utils.py | 1 + .../model_loader/weight_utils.py | 1 + vllm/model_executor/models/__init__.py | 2 + vllm/model_executor/models/adapters.py | 2 + vllm/model_executor/models/arctic.py | 1 + vllm/model_executor/models/aria.py | 2 + vllm/model_executor/models/baichuan.py | 2 + vllm/model_executor/models/bart.py | 2 + vllm/model_executor/models/bert.py | 2 + vllm/model_executor/models/blip.py | 1 + vllm/model_executor/models/blip2.py | 2 + vllm/model_executor/models/bloom.py | 2 + vllm/model_executor/models/chameleon.py | 2 + vllm/model_executor/models/chatglm.py | 2 + vllm/model_executor/models/clip.py | 1 + vllm/model_executor/models/commandr.py | 2 + vllm/model_executor/models/dbrx.py | 2 + vllm/model_executor/models/decilm.py | 2 + vllm/model_executor/models/deepseek.py | 2 + vllm/model_executor/models/deepseek_v2.py | 2 + vllm/model_executor/models/deepseek_v3.py | 2 + vllm/model_executor/models/deepseek_vl2.py | 2 + vllm/model_executor/models/eagle.py | 2 + vllm/model_executor/models/exaone.py | 2 + vllm/model_executor/models/fairseq2_llama.py | 2 + vllm/model_executor/models/falcon.py | 2 + vllm/model_executor/models/florence2.py | 2 + vllm/model_executor/models/fuyu.py | 2 + vllm/model_executor/models/gemma.py | 2 + vllm/model_executor/models/gemma2.py | 2 + vllm/model_executor/models/glm.py | 1 + .../models/glm4_vision_encoder.py 
| 2 + vllm/model_executor/models/gpt2.py | 2 + vllm/model_executor/models/gpt_bigcode.py | 2 + vllm/model_executor/models/gpt_j.py | 2 + vllm/model_executor/models/gpt_neox.py | 2 + vllm/model_executor/models/granite.py | 2 + vllm/model_executor/models/granitemoe.py | 2 + vllm/model_executor/models/gritlm.py | 2 + vllm/model_executor/models/h2ovl.py | 2 + .../models/idefics2_vision_model.py | 2 + vllm/model_executor/models/idefics3.py | 2 + vllm/model_executor/models/interfaces.py | 2 + vllm/model_executor/models/interfaces_base.py | 2 + vllm/model_executor/models/intern_vit.py | 2 + vllm/model_executor/models/internlm2.py | 2 + vllm/model_executor/models/internlm2_ve.py | 2 + vllm/model_executor/models/internvl.py | 2 + vllm/model_executor/models/jais.py | 2 + vllm/model_executor/models/jamba.py | 1 + vllm/model_executor/models/llama.py | 2 + vllm/model_executor/models/llava.py | 2 + vllm/model_executor/models/llava_next.py | 2 + .../model_executor/models/llava_next_video.py | 2 + vllm/model_executor/models/llava_onevision.py | 2 + vllm/model_executor/models/mamba.py | 1 + vllm/model_executor/models/mamba_cache.py | 2 + vllm/model_executor/models/medusa.py | 2 + vllm/model_executor/models/minicpm.py | 2 + vllm/model_executor/models/minicpm3.py | 2 + vllm/model_executor/models/minicpmo.py | 2 + vllm/model_executor/models/minicpmv.py | 2 + vllm/model_executor/models/mixtral.py | 2 + vllm/model_executor/models/mixtral_quant.py | 2 + vllm/model_executor/models/mllama.py | 2 + vllm/model_executor/models/mlp_speculator.py | 2 + vllm/model_executor/models/module_mapping.py | 2 + vllm/model_executor/models/molmo.py | 2 + vllm/model_executor/models/mpt.py | 2 + vllm/model_executor/models/nemotron.py | 2 + vllm/model_executor/models/nvlm_d.py | 2 + vllm/model_executor/models/olmo.py | 2 + vllm/model_executor/models/olmo2.py | 2 + vllm/model_executor/models/olmoe.py | 2 + vllm/model_executor/models/opt.py | 2 + vllm/model_executor/models/orion.py | 2 + 
vllm/model_executor/models/paligemma.py | 2 + vllm/model_executor/models/persimmon.py | 2 + vllm/model_executor/models/phi.py | 2 + vllm/model_executor/models/phi3.py | 2 + vllm/model_executor/models/phi3_small.py | 2 + vllm/model_executor/models/phi3v.py | 2 + vllm/model_executor/models/phimoe.py | 2 + vllm/model_executor/models/pixtral.py | 2 + vllm/model_executor/models/qwen.py | 2 + vllm/model_executor/models/qwen2.py | 2 + vllm/model_executor/models/qwen2_audio.py | 2 + vllm/model_executor/models/qwen2_moe.py | 2 + vllm/model_executor/models/qwen2_rm.py | 2 + vllm/model_executor/models/qwen2_vl.py | 2 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/roberta.py | 2 + vllm/model_executor/models/siglip.py | 1 + vllm/model_executor/models/solar.py | 2 + vllm/model_executor/models/stablelm.py | 2 + vllm/model_executor/models/starcoder2.py | 2 + vllm/model_executor/models/telechat2.py | 2 + vllm/model_executor/models/ultravox.py | 2 + vllm/model_executor/models/utils.py | 2 + vllm/model_executor/models/vision.py | 2 + vllm/model_executor/models/whisper.py | 2 + vllm/model_executor/parameter.py | 2 + vllm/model_executor/pooling_metadata.py | 2 + vllm/model_executor/sampling_metadata.py | 2 + vllm/model_executor/utils.py | 1 + vllm/multimodal/__init__.py | 2 + vllm/multimodal/audio.py | 2 + vllm/multimodal/base.py | 2 + vllm/multimodal/hasher.py | 2 + vllm/multimodal/image.py | 2 + vllm/multimodal/inputs.py | 2 + vllm/multimodal/parse.py | 2 + vllm/multimodal/processing.py | 2 + vllm/multimodal/profiling.py | 2 + vllm/multimodal/registry.py | 2 + vllm/multimodal/utils.py | 2 + vllm/multimodal/video.py | 2 + vllm/outputs.py | 2 + vllm/platforms/__init__.py | 2 + vllm/platforms/cpu.py | 2 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 2 + vllm/platforms/interface.py | 2 + vllm/platforms/neuron.py | 2 + vllm/platforms/openvino.py | 2 + vllm/platforms/rocm.py | 2 + vllm/platforms/tpu.py | 2 + vllm/platforms/xpu.py | 2 + 
vllm/plugins/__init__.py | 2 + vllm/pooling_params.py | 2 + vllm/profiler/__init__.py | 2 + vllm/profiler/layerwise_profile.py | 2 + vllm/profiler/utils.py | 2 + vllm/prompt_adapter/layers.py | 2 + vllm/prompt_adapter/models.py | 2 + vllm/prompt_adapter/request.py | 2 + vllm/prompt_adapter/utils.py | 2 + vllm/prompt_adapter/worker_manager.py | 2 + vllm/sampling_params.py | 1 + vllm/scalar_type.py | 2 + vllm/scripts.py | 2 + vllm/sequence.py | 1 + vllm/spec_decode/batch_expansion.py | 2 + vllm/spec_decode/draft_model_runner.py | 2 + vllm/spec_decode/interfaces.py | 2 + vllm/spec_decode/medusa_worker.py | 2 + vllm/spec_decode/metrics.py | 2 + vllm/spec_decode/mlp_speculator_worker.py | 2 + vllm/spec_decode/mqa_scorer.py | 2 + vllm/spec_decode/multi_step_worker.py | 2 + vllm/spec_decode/ngram_worker.py | 2 + vllm/spec_decode/proposer_worker_base.py | 2 + .../spec_decode/smaller_tp_proposer_worker.py | 2 + vllm/spec_decode/spec_decode_worker.py | 2 + vllm/spec_decode/target_model_runner.py | 2 + vllm/spec_decode/top1_proposer.py | 2 + vllm/spec_decode/util.py | 2 + vllm/tracing.py | 2 + vllm/transformers_utils/__init__.py | 2 + vllm/transformers_utils/config.py | 2 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/arctic.py | 2 + vllm/transformers_utils/configs/chatglm.py | 2 + vllm/transformers_utils/configs/cohere2.py | 2 + vllm/transformers_utils/configs/dbrx.py | 2 + .../configs/deepseek_vl2.py | 2 + vllm/transformers_utils/configs/eagle.py | 2 + vllm/transformers_utils/configs/exaone.py | 2 + vllm/transformers_utils/configs/falcon.py | 2 + vllm/transformers_utils/configs/h2ovl.py | 2 + vllm/transformers_utils/configs/internvl.py | 2 + vllm/transformers_utils/configs/jais.py | 2 + vllm/transformers_utils/configs/medusa.py | 2 + vllm/transformers_utils/configs/mllama.py | 2 + .../configs/mlp_speculator.py | 2 + vllm/transformers_utils/configs/mpt.py | 2 + vllm/transformers_utils/configs/nemotron.py | 2 + 
vllm/transformers_utils/configs/nvlm_d.py | 2 + vllm/transformers_utils/configs/olmo2.py | 2 + vllm/transformers_utils/configs/solar.py | 2 + vllm/transformers_utils/configs/telechat2.py | 2 + vllm/transformers_utils/configs/ultravox.py | 2 + vllm/transformers_utils/detokenizer.py | 2 + vllm/transformers_utils/detokenizer_utils.py | 2 + vllm/transformers_utils/processor.py | 2 + .../transformers_utils/processors/__init__.py | 2 + .../processors/deepseek_vl2.py | 2 + vllm/transformers_utils/s3_utils.py | 2 + vllm/transformers_utils/tokenizer.py | 2 + .../tokenizer_group/__init__.py | 2 + .../tokenizer_group/base_tokenizer_group.py | 2 + .../tokenizer_group/ray_tokenizer_group.py | 2 + .../tokenizer_group/tokenizer_group.py | 2 + .../transformers_utils/tokenizers/__init__.py | 2 + vllm/transformers_utils/tokenizers/mistral.py | 2 + vllm/transformers_utils/utils.py | 2 + vllm/triton_utils/__init__.py | 2 + vllm/triton_utils/custom_cache_manager.py | 2 + vllm/triton_utils/importing.py | 2 + vllm/usage/usage_lib.py | 2 + vllm/utils.py | 2 + vllm/v1/attention/backends/flash_attn.py | 1 + vllm/v1/core/encoder_cache_manager.py | 2 + vllm/v1/core/kv_cache_manager.py | 2 + vllm/v1/core/kv_cache_utils.py | 1 + vllm/v1/core/scheduler.py | 2 + vllm/v1/engine/__init__.py | 2 + vllm/v1/engine/async_llm.py | 2 + vllm/v1/engine/core.py | 2 + vllm/v1/engine/core_client.py | 2 + vllm/v1/engine/detokenizer.py | 2 + vllm/v1/engine/llm_engine.py | 2 + vllm/v1/engine/mm_input_mapper.py | 2 + vllm/v1/engine/output_processor.py | 2 + vllm/v1/engine/processor.py | 2 + vllm/v1/executor/abstract.py | 2 + vllm/v1/executor/multiproc_executor.py | 2 + vllm/v1/kv_cache_interface.py | 2 + vllm/v1/metrics/loggers.py | 2 + vllm/v1/metrics/stats.py | 2 + vllm/v1/outputs.py | 2 + vllm/v1/request.py | 2 + vllm/v1/sample/metadata.py | 2 + vllm/v1/sample/ops/penalties.py | 2 + vllm/v1/sample/ops/topk_topp_sampler.py | 2 + vllm/v1/sample/sampler.py | 1 + vllm/v1/serial_utils.py | 2 + 
vllm/v1/stats/common.py | 2 + vllm/v1/utils.py | 2 + vllm/v1/worker/block_table.py | 2 + vllm/v1/worker/gpu_input_batch.py | 2 + vllm/v1/worker/gpu_model_runner.py | 2 + vllm/v1/worker/gpu_worker.py | 1 + vllm/version.py | 2 + vllm/worker/cache_engine.py | 1 + vllm/worker/cpu_enc_dec_model_runner.py | 2 + vllm/worker/cpu_model_runner.py | 2 + vllm/worker/cpu_pooling_model_runner.py | 2 + vllm/worker/cpu_worker.py | 1 + vllm/worker/enc_dec_model_runner.py | 2 + vllm/worker/hpu_model_runner.py | 2 + vllm/worker/hpu_worker.py | 2 + vllm/worker/model_runner.py | 2 + vllm/worker/model_runner_base.py | 2 + vllm/worker/multi_step_model_runner.py | 2 + vllm/worker/multi_step_tpu_worker.py | 2 + vllm/worker/multi_step_worker.py | 2 + vllm/worker/neuron_model_runner.py | 2 + vllm/worker/neuron_worker.py | 1 + vllm/worker/openvino_model_runner.py | 2 + vllm/worker/openvino_worker.py | 1 + vllm/worker/pooling_model_runner.py | 2 + vllm/worker/tpu_model_runner.py | 2 + vllm/worker/tpu_worker.py | 2 + vllm/worker/utils.py | 1 + vllm/worker/worker.py | 1 + vllm/worker/worker_base.py | 2 + vllm/worker/xpu_model_runner.py | 2 + vllm/worker/xpu_worker.py | 1 + 1012 files changed, 1884 insertions(+), 2 deletions(-) create mode 100644 tools/check_spdx_header.py diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index e29eb78a9..2e4aecdd3 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys import zipfile diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 8350e2705..36e1b6c01 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import os diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index afc935c1a..96e57dfd0 100644 --- 
a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ LM eval harness on model to compare vs HF baseline computed offline. Configs are found in configs/$MODEL.yaml diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 9d3646e2f..e031686c7 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os from pathlib import Path diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 68ac5909e..5e17b79d2 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse from transformers import AutoTokenizer diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 052060c57..0ff95a091 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import json from pathlib import Path diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index 18bcc3a87..e5f179a0f 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from 
lmdeploy.serve.openai.api_client import APIClient api_client = APIClient("http://localhost:8000") diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 92d6fad73..62ee5e10b 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import datetime import json import os diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ae518e190..4568efcbb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,10 +97,14 @@ repos: language: system verbose: true stages: [commit-msg] + - id: check-spdx-header + name: Check SPDX headers + entry: python tools/check_spdx_header.py + language: python + types: [python] - id: suggestion name: Suggestion entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' language: system verbose: true pass_filenames: false - diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0612e8778..364b087b8 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os import sys diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py index 1a0e62598..2b41834ba 100644 --- a/benchmarks/benchmark_guided.py +++ b/benchmarks/benchmark_guided.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark guided decoding throughput.""" import argparse import dataclasses diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 77c4f6aa9..896312945 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark the latency of processing a single batch of requests.""" import argparse 
import dataclasses diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 0b8fba381..21480578e 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Offline benchmark to test the long document QA throughput. diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 3ab421a89..23822856b 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Benchmark the efficiency of prefix caching. diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index e0c9e6a6d..a32065e4e 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark offline prioritization.""" import argparse import dataclasses diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 8b3212831..e934d228f 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 r"""Benchmark online serving throughput. On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py index 4435d87e1..561e500d8 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_guided.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 r"""Benchmark online serving throughput with guided decoding. 
On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c1b10b3cf..658eab6a2 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark offline inference throughput.""" import argparse import dataclasses diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 3d1c5e392..468a1b286 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index ef06fcd66..bab377800 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Cutlass bench utils from typing import Iterable, Tuple diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index b87496ca3..6552b62da 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d58fb0bf8..3d1121df4 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) # Example: diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 4058b1c0a..980e68668 100644 --- 
a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import aiohttp diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index 6eb5f6398..c2ad4916b 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index e59d8bb0e..a7b4b9e8b 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import matplotlib.pyplot as plt diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index ef91f9f8e..c56cc7438 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle as pkl import time from dataclasses import dataclass diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 601c4ea43..8d20b9156 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys from typing import Optional diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 7acea6087..d265c91bf 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch diff --git 
a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index e1f613e1d..ecde8fbaa 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import json diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 46bab74ae..0301fee1a 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 8fb44e3a3..c22e66c0b 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import torch diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 068830f02..a4a45c9cb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import time from datetime import datetime diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 219013a38..daedaadb1 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random import time from typing import List, Optional diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 1d6248344..0ddea9390 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch diff --git a/benchmarks/kernels/benchmark_rmsnorm.py 
b/benchmarks/kernels/benchmark_rmsnorm.py index baa5de0ff..dba153742 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from typing import Optional, Tuple, Union diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 250d50516..8ee0212a0 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import accumulate from typing import List, Optional diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 4eeeca35a..c375e61e4 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], "mistralai/Mistral-7B-v0.1/TP1": [ diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 7d0bd8415..01d97d63d 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import pickle import re diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index fee877b6f..728170748 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Any, Callable, Iterable, Optional diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 51f24f3ba..89b05d588 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) # Example: diff --git a/benchmarks/overheads/benchmark_hashing.py 
b/benchmarks/overheads/benchmark_hashing.py index d16d6f9fb..5f94552e9 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import cProfile import pstats diff --git a/cmake/hipify.py b/cmake/hipify.py index 340e41c81..2e0c8a172 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + #!/usr/bin/env python3 # diff --git a/collect_env.py b/collect_env.py index 254c19b19..0ec9d4cae 100644 --- a/collect_env.py +++ b/collect_env.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index b401736c9..d5a5e2ef8 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from typing import Dict, Union diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index a9b5ddf4c..02e59fe28 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import math import os diff --git a/docs/source/conf.py b/docs/source/conf.py index 6b0a1dad1..ea3b56e02 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. 
For a full diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index ac592e223..9d4de18a3 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import re from dataclasses import dataclass, field diff --git a/examples/offline_inference/aqlm_example.py b/examples/offline_inference/aqlm_example.py index 40f9a21ec..e8db3811f 100644 --- a/examples/offline_inference/aqlm_example.py +++ b/examples/offline_inference/aqlm_example.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams from vllm.utils import FlexibleArgumentParser diff --git a/examples/offline_inference/arctic.py b/examples/offline_inference/arctic.py index 1fec3c99e..90c88446c 100644 --- a/examples/offline_inference/arctic.py +++ b/examples/offline_inference/arctic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams # Sample prompts. diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 5952ec13e..707ca9f87 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with the correct prompt format on audio language models. diff --git a/examples/offline_inference/basic.py b/examples/offline_inference/basic.py index 23cc6e853..a6e96c0bb 100644 --- a/examples/offline_inference/basic.py +++ b/examples/offline_inference/basic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams # Sample prompts. 
diff --git a/examples/offline_inference/basic_with_model_default_sampling.py b/examples/offline_inference/basic_with_model_default_sampling.py index 346bb80b1..80de9428f 100644 --- a/examples/offline_inference/basic_with_model_default_sampling.py +++ b/examples/offline_inference/basic_with_model_default_sampling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. diff --git a/examples/offline_inference/chat.py b/examples/offline_inference/chat.py index 8814f4d7b..dbc710cc8 100644 --- a/examples/offline_inference/chat.py +++ b/examples/offline_inference/chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index e69a6c067..15519bfed 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa import json import random diff --git a/examples/offline_inference/classification.py b/examples/offline_inference/classification.py index de539b639..4a364aeb8 100644 --- a/examples/offline_inference/classification.py +++ b/examples/offline_inference/classification.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. 
diff --git a/examples/offline_inference/cli.py b/examples/offline_inference/cli.py index 391ac6b9b..bc6833b3f 100644 --- a/examples/offline_inference/cli.py +++ b/examples/offline_inference/cli.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import asdict from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/cpu_offload.py b/examples/offline_inference/cpu_offload.py index b152e5bc3..5511eb738 100644 --- a/examples/offline_inference/cpu_offload.py +++ b/examples/offline_inference/cpu_offload.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams # Sample prompts. diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py index 677127844..a2df41d4c 100644 --- a/examples/offline_inference/distributed.py +++ b/examples/offline_inference/distributed.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use Ray Data for running offline batch inference distributively on a multi-nodes cluster. diff --git a/examples/offline_inference/embedding.py b/examples/offline_inference/embedding.py index 58d004313..f9399329d 100644 --- a/examples/offline_inference/embedding.py +++ b/examples/offline_inference/embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. 
diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 0f266d791..8765d1812 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 ''' Demonstrate prompting of text-to-text encoder/decoder models, specifically BART diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py index c24096e90..58610b0fd 100644 --- a/examples/offline_inference/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 ''' Demonstrate prompting of text-to-text encoder/decoder models, specifically Florence-2 diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py index aa05c4c0b..0447e74e0 100644 --- a/examples/offline_inference/gguf_inference.py +++ b/examples/offline_inference/gguf_inference.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from huggingface_hub import hf_hub_download from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index 60d894aae..501034c1c 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse from typing import List, Tuple diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index 0c454ea50..de0734c1a 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use LoRA with different quantization techniques for offline inference. 
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 8f0eb65e4..10d9de8cb 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gc import time from typing import List diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 043220d97..630fd1bf8 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use the multi-LoRA functionality for offline inference. diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index f098c8e5f..517d1bfce 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams # Sample prompts. 
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index 8ec17e340..c899a01a0 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/pixtral.py b/examples/offline_inference/pixtral.py index c12ff7021..760de1145 100644 --- a/examples/offline_inference/pixtral.py +++ b/examples/offline_inference/pixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa import argparse diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index 67b755a15..4c326c417 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 8a94b5c2a..c2e072fdd 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import inspect import json import os diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index d7423e6c6..b1fe829b3 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import os diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 5c4918008..5000251c0 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -1,3 +1,4 @@ +# 
SPDX-License-Identifier: Apache-2.0 """ a simple demonstration of RLHF with vLLM, inspired by the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF . diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 4207f8922..863276432 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Saves each worker's model state dict directly to a checkpoint, which enables a fast load path for large tensor-parallel models where each worker only needs to diff --git a/examples/offline_inference/scoring.py b/examples/offline_inference/scoring.py index 5da9e7109..7daa82b82 100644 --- a/examples/offline_inference/scoring.py +++ b/examples/offline_inference/scoring.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index abcfa8e8f..b45954b3b 100644 --- a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 00d864606..38ffd7fb9 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from enum import Enum from pydantic import BaseModel diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index b6de73eb7..35df60115 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ experimental support for tensor-parallel inference with 
torchrun, see https://github.com/vllm-project/vllm/issues/11400 for diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 251629b80..bd0e98462 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams prompts = [ diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 38c2b13d3..65940b6ad 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for text generation. diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index 4ce3d496b..3075fbbfa 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for multimodal embedding. 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 43c44fa86..601ac96e1 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with multi-image input on vision language models for text generation, diff --git a/examples/offline_inference/whisper.py b/examples/offline_inference/whisper.py index 087ad4376..59c119a77 100644 --- a/examples/offline_inference/whisper.py +++ b/examples/offline_inference/whisper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from vllm import LLM, SamplingParams diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index 49a085feb..623e0d59a 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Example Python client for `vllm.entrypoints.api_server` NOTE: The API server is used only for demonstration and simple performance benchmarks. It is not intended for production use. 
diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index a07affe33..fc434ada1 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ b/examples/online_serving/cohere_rerank_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example of using the OpenAI entrypoint's rerank API which is compatible with the Cohere SDK: https://github.com/cohere-ai/cohere-python diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 8ceb8f68e..ee01e1eae 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import gradio as gr diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index 54e907582..c619146b0 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import json diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index bf4de76dd..3e760e171 100644 --- a/examples/online_serving/jinaai_rerank_client.py +++ b/examples/online_serving/jinaai_rerank_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example of using the OpenAI entrypoint's rerank API which is compatible with Jina and Cohere https://jina.ai/reranker diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index bbada3891..a81562041 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from openai import OpenAI # Modify OpenAI's API key and API 
base to use vLLM's API server. diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 03cc037bb..d5f798a8d 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """An example showing how to use vLLM to serve multimodal models and run online serving with OpenAI client. diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 2bbe42b6b..416fb61ca 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Set up this example by starting a vLLM OpenAI-compatible server with tool call options enabled. For example: diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 8c059c7ca..cddd93180 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from enum import Enum from openai import OpenAI diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index 83e51a48b..a88c8adb5 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index 8c14aac6b..489bfcd5e 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index a56e7429b..f49d7a228 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import base64 import io diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 58519f978..06b93d7d1 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 365a684d5..67c5fc91b 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example online usage of Score API. 
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index 4bd7ca01d..cb1109974 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index 37ec8f2fb..e17f9c5ef 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example online usage of Pooling API. diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py index b1a2b3c3c..7a605f85b 100644 --- a/examples/online_serving/opentelemetry/dummy_client.py +++ b/examples/online_serving/opentelemetry/dummy_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import requests from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( OTLPSpanExporter) diff --git a/examples/other/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py index 5fff1fdf5..68345e6cb 100644 --- a/examples/other/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import json diff --git a/find_cuda_init.py b/find_cuda_init.py index 51db23102..0d13b2f86 100644 --- a/find_cuda_init.py +++ b/find_cuda_init.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import importlib import traceback from typing import Callable diff --git a/python_only_dev.py b/python_only_dev.py index 7d95ac96e..a303697b7 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + msg = """Old style python only 
build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation). TL;DR: diff --git a/setup.py b/setup.py index 50a2392a4..50265d46e 100755 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ctypes import importlib.util import logging diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index a3c9d5c6e..d9ac61164 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """vllm.entrypoints.api_server with some extra logging for testing.""" from typing import Any, Dict, Iterable diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 91ac35dd6..77f3fb002 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import subprocess import sys import time diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 8a04693ba..ca29abc92 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import uuid diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 5668cc30d..fd6d89d4e 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.engine.async_llm_engine import RequestTracker diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 232850406..2792dfde7 100644 --- 
a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the short outputs of HF and vLLM when using greedy sampling. Run `pytest tests/basic_correctness/test_basic_correctness.py`. diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 469d18a4d..cefd54d1c 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using greedy sampling. It tests chunked prefill. Chunked prefill can be enabled by diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index d7f36a781..b4d558ce2 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from ..utils import compare_two_settings diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 53f4ef08f..da9239b09 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm import LLM, SamplingParams diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 4b27dcbc8..6aaec6eef 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the short outputs of HF and vLLM when using greedy sampling. VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. 
diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 8fa10e5bd..74bc58a2d 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from copy import deepcopy from typing import Callable, Union diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index aa1152481..9d633ad25 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index d4ede4d23..0404722ba 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. 
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 1945479fc..d7acec690 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Dict, List, Optional diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 4dfdfe21a..6e83fa368 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.config import CompilationLevel diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index ea3aaee95..8f5040522 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index b4266a4a7..c14f0caab 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch from compressed_tensors.quantization import FP8_DTYPE diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 03e753509..70920ab10 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle import pytest diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 74f66baaa..0934c6113 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 7c92d165d..e4a88584e 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + import os import torch diff --git a/tests/conftest.py b/tests/conftest.py index 279c1bf9a..85dd5bcb0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os import tempfile diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py index 0464d6a74..b7a9863f4 100644 --- a/tests/core/block/conftest.py +++ b/tests/core/block/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index 70577ec05..7d3ccaada 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, Iterable, Optional import pytest diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 86502f613..e9b537ed5 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import cycle import pytest diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 415d0bd82..c874608e4 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py index cfd749ad5..68d9618ae 100644 --- a/tests/core/block/test_block_manager.py +++ b/tests/core/block/test_block_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py 
index e2391a568..d8cf0bec7 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index cfdd3582e..202608730 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random import pytest diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index a9e38d404..a1414edd9 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index 10d5964dc..0ca2a0b80 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional import pytest diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 6642174c1..771627a57 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import random from typing import List, Optional diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index eaaf004df..8da25aea4 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from unittest.mock import MagicMock diff --git a/tests/core/test_num_computed_tokens_update.py 
b/tests/core/test_num_computed_tokens_update.py index bd4accab7..a4a901444 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from tests.conftest import VllmRunner diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 8f6de84e5..dcc97ebaa 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from collections import deque from typing import List, Set, Tuple diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 16bea5493..a4e3c73a5 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest # noqa diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index d604e5250..64b3e148e 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import msgspec from vllm.executor.msgspec_utils import decode_hook, encode_hook diff --git a/tests/core/utils.py b/tests/core/utils.py index 16703cd19..fb77dccce 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from collections import defaultdict from typing import Any, Dict, List, Optional diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py index fc4043cd3..72e7ebdb7 100644 --- a/tests/distributed/test_ca_buffer_sharing.py +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # can only run on machines with p2p access across GPUs # can only run with torchrun: # torchrun --nproc_per_node=2 
tests/distributed/test_ca_buffer_sharing.py diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index d01f18752..bc916e8de 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test the communication operators. Run `pytest tests/distributed/test_comm_ops.py`. diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 4072616fd..46887bca4 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import random diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py index 62e77a2f7..4b0c65d1d 100644 --- a/tests/distributed/test_distributed_oot.py +++ b/tests/distributed/test_distributed_oot.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from ..entrypoints.openai.test_oot_registration import ( run_and_test_dummy_opt_api_server) diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py index 9f9c0ff07..c86d2d8a0 100644 --- a/tests/distributed/test_multi_node_assignment.py +++ b/tests/distributed/test_multi_node_assignment.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Make sure ray assigns GPU workers to the correct node. Run: diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index ddbf40f08..5b6741d74 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ WARNING: This test runs in both single-node (4 GPUs) and multi-node (2 node with 2 GPUs each) modes. 
If the test only uses 2 GPUs, it is diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 2d4d07dd2..3ed104820 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 4912858d8..3bc85b05e 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index a8571a115..4c42a0ed8 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import multiprocessing import os from typing import Dict, List diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 62311a626..9b1bbd6e5 100644 --- a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import torch.distributed as dist diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index 723872682..59fa7cc9f 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import multiprocessing import random import time diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index 7aa03d7f0..a092a548a 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # unit test for `examples/offline_inference/torchrun_example.py` import random diff --git 
a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 5fb1ae7b2..4432950f2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import socket import pytest diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index fa5d6a69a..d0e4f8625 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """E2E tests to verify the correctness of the encoder-decoder framework Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py index 88f3fad4c..3ba3c4ec5 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from unittest.mock import MagicMock diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index cc14e8cbf..e9ad8d161 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock import pytest diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 4e269de9f..8698d124e 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from argparse import ArgumentTypeError import pytest diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index ed35212cc..dca8fa602 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.engine.arg_utils import EngineArgs diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index 0e33f3662..3e77faecb 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index f77f6d072..742176ea8 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.entrypoints.llm import LLM diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 04505fcaa..f1fe58e35 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from concurrent.futures import ThreadPoolExecutor from functools import partial diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index a6ba7a131..d5111e3fd 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from ..conftest import IMAGE_ASSETS diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index b8818af56..655c8232a 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.entrypoints.llm import LLM diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index b0bd6c4aa..a50b38804 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -1,3 +1,4 
@@ +# SPDX-License-Identifier: Apache-2.0 """Test the different finish_reason="stop" situations during generation: 1. One of the provided stop strings 2. One of the provided stop tokens diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 499935620..0f633bb26 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, List, Optional import pytest diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index ef74062ce..b00e168db 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 6bf7190a6..29ff00df6 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file test accuracy of the vLLM server via LMEval. 
It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index fc66386fd..77c80b2f8 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 22473ce27..39d4810de 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm import LLM diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 3906ad766..ebec8baba 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref from typing import List diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 7d2b37775..4c78c2c8e 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref from typing import List diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index eb2113692..90e1d5814 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref import pytest diff --git a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py index c2dab300e..c2b4a9358 100644 --- a/tests/entrypoints/llm/test_gpu_utilization.py +++ b/tests/entrypoints/llm/test_gpu_utilization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, 
SamplingParams diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index ccb9906fc..932a35a99 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re import weakref diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py index c9a4ad44f..925bf56a9 100644 --- a/tests/entrypoints/llm/test_init.py +++ b/tests/entrypoints/llm/test_init.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm import LLM diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index bf609b38a..b1f9ae14d 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import sys from contextlib import nullcontext diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index ee7010a23..f2c145fa3 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm import LLM diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 65699e609..eac76f2ba 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for HF_HUB_OFFLINE mode""" import importlib import sys diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py index 4607e4dfe..f7b81be48 100644 --- 
a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py +++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py index ac73ad50a..2157e0595 100644 --- a/tests/entrypoints/openai/reasoning_parsers/utils.py +++ b/tests/entrypoints/openai/reasoning_parsers/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py index b1d4461d1..df25780cd 100644 --- a/tests/entrypoints/openai/test_accuracy.py +++ b/tests/entrypoints/openai/test_accuracy.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file test accuracy of the vLLM server via LMEval. 
It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index fcce8b46c..1f7ba0da4 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import contextlib import random diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 1116c0da1..6e206dfd9 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List import openai diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index 547c1fd02..0d44a7611 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from http import HTTPStatus from typing import List diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 5e6499d8f..4b5ad55c5 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # imports for guided decoding tests import json import re diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index 223ac5b41..3e76158a8 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import NamedTuple import openai # use the official client for correctness check diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index e1e1dcff7..255aba139 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ 
b/tests/entrypoints/openai/test_chat_template.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.entrypoints.chat_utils import (apply_hf_chat_template, diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 61d663651..0419395f1 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import openai # use the official client for correctness check import pytest import pytest_asyncio diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 01bcd78aa..2f065ec10 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import pytest diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 183d900c4..28671cc27 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # imports for guided decoding tests import json import re diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index b52a5b28c..e86ea87dd 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 import numpy as np diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py index 51eba694e..52b4df9ce 100644 --- a/tests/entrypoints/openai/test_encoder_decoder.py +++ b/tests/entrypoints/openai/test_encoder_decoder.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import openai import pytest import pytest_asyncio diff --git 
a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 6ff99f6fa..1a62157ac 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import json import shutil diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 941f46571..a9134be62 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import subprocess import sys import tempfile diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index ae5bf404d..3d4f1cde2 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import openai # use the official client for correctness check import pytest import pytest_asyncio diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index b25cb1d0e..a1b7a205a 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from ...utils import VLLM_PATH, RemoteOpenAIServer chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py index 9c4923939..11d3bfafa 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 import numpy as np diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 1ae64ef49..64a1eb6a6 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ 
b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # imports for guided decoding tests import re diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index cfd8f3313..4c9774a73 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import requests diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 99f6da160..9b33eddae 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Separate these tests out from test_completion and test_chat, because they # require launching a second server with a different flag. Running both servers # at the same time on a single node will OOM. diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index 20f796061..ad8159afc 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import os from typing import Any, List, NamedTuple diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 1f8a56bb4..db049ee2b 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import subprocess import sys diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 0d19615bc..bcbcb5702 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import requests diff 
--git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index e88d6c3c6..1e7dbaf60 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from contextlib import suppress from dataclasses import dataclass diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 657ea2021..70ca8507a 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from http import HTTPStatus from unittest.mock import MagicMock diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 090523a83..5edf85ab5 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import openai import pytest diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index b1956a8cb..663b72242 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import pytest_asyncio import requests diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index e73449e40..ab9285407 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List import openai diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 5f070ba3b..029c9b038 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List import openai diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index c851539c6..f2ff4a0b0 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict import pytest diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index 47b0b6bb8..788efa86b 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from unittest.mock import MagicMock diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index f0a2a32c1..57ec98653 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Tuple, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 513b466c1..737f73309 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings from typing import Optional diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py index 175cfe82f..97ceffab4 100644 --- a/tests/kernels/allclose_default.py +++ b/tests/kernels/allclose_default.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch # Reference default values of atol and rtol are from diff --git a/tests/kernels/conftest.py 
b/tests/kernels/conftest.py index 4f2f9cc3d..4f04ec947 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.utils import (create_kv_caches_with_random, diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index f2358940f..34dcf91c7 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple, Union import torch diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index dac26efe8..2e70b1db3 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import Type diff --git a/tests/kernels/test_aqlm.py b/tests/kernels/test_aqlm.py index 860fb66b1..7d3617281 100644 --- a/tests/kernels/test_aqlm.py +++ b/tests/kernels/test_aqlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from tests.kernels.utils import opcheck diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 574a0f223..b667d8d9e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List, Optional, Tuple diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 492acb91e..0e8743731 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import Mock, patch import pytest diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py index aa7a43085..ace75a336 100644 --- a/tests/kernels/test_awq.py +++ b/tests/kernels/test_awq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git 
a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 238d6426b..67595010c 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test AWQ with fused MoE Marlin kernels. Run `pytest tests/kernels/test_awq_marlin.py`. diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/test_awq_triton.py index 406a0c8dd..3fc3feaf4 100644 --- a/tests/kernels/test_awq_triton.py +++ b/tests/kernels/test_awq_triton.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the AWQ Triton kernel. Run `pytest tests/kernels/test_awq_triton.py`. diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index f28fdf3fe..20eff1c20 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://github.com/sgl-project/sglang/pull/2575 import itertools diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 08f31219e..e653d34d0 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List, Optional, Tuple diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index c848be4f9..6f909b680 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List, Tuple diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py index 8edfde42e..8cc1a6a1b 100755 --- a/tests/kernels/test_cascade_flash_attn.py +++ b/tests/kernels/test_cascade_flash_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import pytest diff --git a/tests/kernels/test_causal_conv1d.py 
b/tests/kernels/test_causal_conv1d.py index 51be2425d..93064e23d 100644 --- a/tests/kernels/test_causal_conv1d.py +++ b/tests/kernels/test_causal_conv1d.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import pytest diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index f538d492c..49fd8ed63 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for cutlass kernels Run `pytest tests/kernels/test_cutlass.py`. diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py index 56495df34..4c613b75f 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for sparse cutlass kernels Run `pytest tests/kernels/test_semi_structured.py`. diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index e008a56de..0d11e8652 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests: diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 0ee0bf6c6..b8af89b66 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import pytest diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index 1645ef911..212ceb5e4 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import flashinfer diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/test_fp8_quant.py index ebaaae232..876cf03fd 100644 --- a/tests/kernels/test_fp8_quant.py 
+++ b/tests/kernels/test_fp8_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/test_fused_quant_layernorm.py index baf8d73fd..d4b674b23 100644 --- a/tests/kernels/test_fused_quant_layernorm.py +++ b/tests/kernels/test_fused_quant_layernorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple, Union import pytest diff --git a/tests/kernels/test_ggml.py b/tests/kernels/test_ggml.py index dddb285bf..dc728fd48 100644 --- a/tests/kernels/test_ggml.py +++ b/tests/kernels/test_ggml.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gguf import pytest import torch diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py index 893af99ba..847ca9f43 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/test_gguf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from pathlib import Path from typing import List diff --git a/tests/kernels/test_gptq.py b/tests/kernels/test_gptq.py index c1ca6f1f5..fea013d9e 100644 --- a/tests/kernels/test_gptq.py +++ b/tests/kernels/test_gptq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from tests.kernels.utils import opcheck diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 761eb95c4..25dcb587e 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 727769e07..fa4bbe458 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/test_machete_mm.py index 1c6eb2dd9..bd60526ed 100644 --- a/tests/kernels/test_machete_mm.py +++ 
b/tests/kernels/test_machete_mm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the machete kernel. Run `pytest tests/kernels/test_machete_mm.py`. diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index 19d1158c7..84d4c347e 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch import torch.nn.functional as F diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 5e047f4b0..b96aca06c 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the marlin kernel. Run `pytest tests/kernels/marlin/test_marlin_gemm.py`. diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py index eab874e9e..5a18b7916 100644 --- a/tests/kernels/test_mha_attn.py +++ b/tests/kernels/test_mha_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Test: diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 7aa248ed1..0f13fbc96 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the MOE layers. Run `pytest tests/kernels/test_moe.py`. 
diff --git a/tests/kernels/test_permute_cols.py b/tests/kernels/test_permute_cols.py index 14ad7a22c..35d62079f 100644 --- a/tests/kernels/test_permute_cols.py +++ b/tests/kernels/test_permute_cols.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index eee77c22a..5b7b0fda2 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import accumulate, product from typing import Dict, List, Optional diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 10e73ab95..2184c9852 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import random import time diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/test_rotary_embedding.py index da879406b..362bcb35c 100644 --- a/tests/kernels/test_rotary_embedding.py +++ b/tests/kernels/test_rotary_embedding.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests for miscellaneous utilities """ diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/test_triton_decode_attention.py index 14f5a3b77..fd3c9fa41 100644 --- a/tests/kernels/test_triton_decode_attention.py +++ b/tests/kernels/test_triton_decode_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index a5aab3c2e..d878ed6f4 100644 --- a/tests/kernels/test_triton_scaled_mm.py +++ b/tests/kernels/test_triton_scaled_mm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the triton_scaled_mm kernel Run `pytest tests/kernels/test_triton_scaled_mm.py`. 
diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py index 7e5126a76..d3f032002 100644 --- a/tests/kernels/test_utils.py +++ b/tests/kernels/test_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests for miscellaneous utilities """ diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index c735c5edd..5be111d71 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Kernel test utils""" import itertools diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py index adc6150ed..97e0d6eb1 100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import subprocess import sys diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/module_test.py index 355461919..8a6490b5c 100644 --- a/tests/kv_transfer/module_test.py +++ b/tests/kv_transfer/module_test.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import subprocess import sys diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 4d6890305..c5b34660d 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import random diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 1cc1ced99..181a5ac20 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time from typing import List diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index e7378d007..071cdbecc 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import tempfile from collections import OrderedDict from typing import Dict, List, TypedDict 
diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 61b8899f0..2d33f738b 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa """This file contains a dictionary of prompts and golden responses.""" diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 0ba2ce361..249f7619d 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 49a527b99..0aa9fe7a9 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import vllm diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 5ae705e47..8923aa221 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py index 6aa33926c..c04174665 100644 --- a/tests/lora/test_jamba.py +++ b/tests/lora/test_jamba.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 08a589d7e..0838ca02c 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from copy import deepcopy from dataclasses import dataclass diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index dfeac3809..39f779f40 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import ray diff --git 
a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index e7a34f2ce..62005de73 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ast from typing import List, Optional, Tuple diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py index c2520c847..cbdd68831 100644 --- a/tests/lora/test_lora_bias_e2e.py +++ b/tests/lora/test_lora_bias_e2e.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index b907af47d..d2a4b901b 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 1c0ee01c0..273fe9ae0 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 9a5b9aabf..6666f54fd 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Dict, List diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 3b0f18325..2e81bb326 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 940a86522..90cf8fd39 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import 
pytest diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index a524d5ce5..9935472ad 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import math import shutil diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 5a3fcb8d6..651c89ffc 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import vllm diff --git a/tests/lora/test_punica_ops_sizes.py b/tests/lora/test_punica_ops_sizes.py index 433ca7577..ecd3bc497 100644 --- a/tests/lora/test_punica_ops_sizes.py +++ b/tests/lora/test_punica_ops_sizes.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This script is mainly used to tests various hidden_sizes. We have collected the hidden_sizes included in the LoRA models currently supported by vLLM. It tests diff --git a/tests/lora/test_punica_ops_variation.py b/tests/lora/test_punica_ops_variation.py index 2bb84c1cf..6d1d3c943 100644 --- a/tests/lora/test_punica_ops_variation.py +++ b/tests/lora/test_punica_ops_variation.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This script is mainly used to test whether trtion kernels can run normally under different conditions, including various batches, numbers of LoRA , and diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 26bf770cc..5702aa26b 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py from dataclasses import dataclass diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 570aa3861..a988f06ab 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index d225a3f7d..589167e80 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 85110b8fa..34a26e9ed 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import OrderedDict from unittest.mock import patch diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 9d814f657..797141ea3 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import random import tempfile diff --git a/tests/lora/utils.py b/tests/lora/utils.py index ce47546f2..bda00e081 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Optional import torch diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index b3c785055..0942c8eed 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from typing import List diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index 10792b0a0..b588a1a96 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 0a3aba255..2c6780848 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index be5282d9c..64d0928f8 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle import pytest diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 9c1f784c1..760a11993 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index c8b9bed69..11dfe4d49 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import tempfile diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 1e329dc4c..fe9361d12 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Type import numpy as np diff --git a/tests/models/decoder_only/language/test_aqlm.py b/tests/models/decoder_only/language/test_aqlm.py index a8cb5bbf9..85557b30d 100644 --- a/tests/models/decoder_only/language/test_aqlm.py +++ b/tests/models/decoder_only/language/test_aqlm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of a AQLM model between vLLM and HF Transformers Run `pytest tests/models/test_aqlm.py`. 
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 5f06f1e3a..6a0e148d5 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # flake8: noqa """Tests fp8 models against ground truth generation Note: these tests will only pass on L4 GPU. diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index ad8f8a0c3..57fe1d5b1 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests gguf models against unquantized models generations Note: To pass the test, quantization higher than Q4 should be used diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py index 037411a18..0f61466c3 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compares the outputs of gptq vs gptq_marlin Note: GPTQ and Marlin do not have bitwise correctness. As a result, in this test, we just confirm that the top selected tokens of the diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py index 26cb3ec31..c81626148 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of a GPTQ model to a Marlin_24 model. Note: GPTQ and Marlin_24 do not have bitwise correctness. 
diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index 5e93842f4..119b79d64 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Granite models using greedy sampling. Run `pytest tests/models/test_granite.py`. diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 2e06b10fb..cc98f1d7b 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from tests.utils import multi_gpu_test diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 1ad4f5aae..854f4fe4f 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using greedy sampling for Mamba. Run `pytest tests/models/test_mamba.py`. diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index bdc157178..179236730 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. Run `pytest tests/models/test_mistral.py`. 
diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py index 077e50e3a..66dd97957 100644 --- a/tests/models/decoder_only/language/test_modelopt.py +++ b/tests/models/decoder_only/language/test_modelopt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # flake8: noqa """Tests Model Optimizer fp8 models against ground truth generation Note: these tests will only pass on H100 diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index c7efa4edb..1ad562415 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using greedy sampling. Run `pytest tests/models/test_models.py`. diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index c997359a2..f9757d6ac 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for moe models using greedy sampling. Run `pytest tests/models/test_phimoe.py`. 
diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index 18ceb34a4..31a5cd260 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Type import pytest diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py index 7406df253..9590adf6f 100644 --- a/tests/models/decoder_only/vision_language/test_h2ovl.py +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import pytest diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py index 32fcb0bbc..a842d14fe 100644 --- a/tests/models/decoder_only/vision_language/test_intern_vit.py +++ b/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import pytest diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 62c644f73..e3cda8971 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. 
""" diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index 3a8934adf..dd68fe4cd 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import re from typing import List, Optional, Tuple, Type diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 8103e5305..602da2b5f 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. Run `pytest tests/models/test_mistral.py`. diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 5a485f3d8..de240a904 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, List, Optional, Tuple, Type, TypedDict, Union import numpy.typing as npt diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py index 59773be70..539410d18 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Helpers for building inputs that can be leveraged for different test types. 
""" from pathlib import PosixPath diff --git a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py index 9bb713416..ca4ec2141 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utils for determining which subset of model tests belong to a specific modality, getting all combinations (similar to pytest's parametrization), handling multimodal placeholder substitution, and so on. diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 54b7b0733..0aed26769 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Core test implementation to be shared across modalities.""" from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union @@ -153,4 +154,4 @@ def process_runner_outputs( def process_outputs(output_processor, model, outputs_per_image): """Applies a model specific post-processor function to a runner's output""" return [[output_processor(res, model) for res in outputs] - for outputs in outputs_per_image] \ No newline at end of file + for outputs in outputs_per_image] diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py index 2291f4fa0..2f03a114a 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Custom input builders for edge-cases in different models.""" from typing import Callable diff --git 
a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 07bdb2cee..b0a88161c 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Common utility functions relating to different models that are useful for manipulating the input / output of HF & vLLM test runners, which are typically specific to a small subset of models. diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py index 2d3b39fe3..fb9df37ca 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Entrypoints for wrapping the core run_test implementation for specific test types / modalities. """ diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index e2e0c6390..ae3b9d59b 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Types for writing multimodal model tests.""" from enum import Enum from pathlib import PosixPath diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 0cbe4afe9..b0420ff5c 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the classification outputs of HF and vLLM models. Run `pytest tests/models/test_cls_models.py`. 
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index e17198e38..ad6385376 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the embedding outputs of HF and vLLM models. Run `pytest tests/models/embedding/language/test_embedding.py`. diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index 55c2e5d4e..7ed2fb8a6 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import importlib.util import math from array import array diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index 3db27d942..d6408258f 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the scoring outputs of HF and vLLM models. Run `pytest tests/models/embedding/language/test_scoring.py`. 
diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index f96c7d2b1..567aa5098 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Sequence import torch diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py index 2641987b2..82f2bf531 100644 --- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial from typing import Callable, Dict, List, Type diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index f4cd8b81a..6ba3c5403 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Type import pytest diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 9374c23dd..0cb948746 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Type import pytest diff --git a/tests/models/encoder_decoder/audio_language/test_whisper.py b/tests/models/encoder_decoder/audio_language/test_whisper.py index eb238c533..80d6897da 100644 --- a/tests/models/encoder_decoder/audio_language/test_whisper.py +++ b/tests/models/encoder_decoder/audio_language/test_whisper.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Whisper models using greedy sampling. Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. 
diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 10aba8427..81b629fdc 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for BART models using greedy sampling. Run `pytest tests/models/encoder_decoder/language/test_bart.py`. diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py index 542f41a38..8d986414e 100644 --- a/tests/models/encoder_decoder/vision_language/test_broadcast.py +++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from ....utils import multi_gpu_test diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py index d686f1da3..a1d156799 100644 --- a/tests/models/encoder_decoder/vision_language/test_florence2.py +++ b/tests/models/encoder_decoder/vision_language/test_florence2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial from typing import List, Optional, Tuple, Type diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 16c71228e..4cd2dbdb4 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Type, overload import pytest diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index ca28da268..3921d4e19 100644 --- a/tests/models/multimodal/processing/test_common.py +++ 
b/tests/models/multimodal/processing/test_common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial import numpy as np diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 69b91ad4a..00c1dae51 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for Idefics3's multimodal preprocessing kwargs.""" from typing import Optional diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index d6c60595c..0d921e9d3 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for InternVL's multimodal preprocessing kwargs.""" from typing import Callable, Optional diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index 6de649f87..d2497e62d 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from functools import partial diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index 806437d35..bd4dbd46d 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from functools import partial diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index 7f82a8f18..44edec457 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ 
b/tests/models/multimodal/processing/test_phi3v.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for phi3v's multimodal preprocessing kwargs.""" import pytest diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index de14fbbff..47c9b0add 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.multimodal import MULTIMODAL_REGISTRY diff --git a/tests/models/registry.py b/tests/models/registry.py index 7952e65aa..d0dbbf00e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass, field from typing import AbstractSet, Any, Literal, Mapping, Optional diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index d3a3aaf67..64928a65d 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import patch import pytest diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 2c413a633..ef665baa1 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index ac0366847..80d3f78f9 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings import pytest diff --git a/tests/models/utils.py b/tests/models/utils.py index 0eb3f61f1..e2be43c12 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings from typing import 
Dict, List, Optional, Sequence, Tuple, Union diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py index 782b508a5..808346b5e 100644 --- a/tests/mq_llm_engine/test_abort.py +++ b/tests/mq_llm_engine/test_abort.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that aborting is handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 83bc4e7cf..35d001781 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that various errors are handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index 630c112d0..2069ff987 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that the MQLLMEngine is able to handle 10k concurrent requests.""" import asyncio diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py index f717c1355..11e44f12b 100644 --- a/tests/mq_llm_engine/utils.py +++ b/tests/mq_llm_engine/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import multiprocessing from typing import Callable, Tuple, Union diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index b8524ed83..9822cee14 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Test the AsyncLLMEngine with multi-step-decoding from typing import List, Optional diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 34030d9d6..29d5ffd4c 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -1,3 +1,5 @@ 
+# SPDX-License-Identifier: Apache-2.0 + # Test the LLMEngine with multi-step-decoding import copy diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py index 678bbb52b..f5d3e282f 100644 --- a/tests/multimodal/test_inputs.py +++ b/tests/multimodal/test_inputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 13f820d01..6cccd2aa2 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import nullcontext from typing import cast from unittest.mock import MagicMock diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index d141cdf1f..5d18b2ed7 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from typing import Callable, Dict, Mapping, Optional from unittest.mock import patch diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 198344e5b..f9e0f507a 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 import mimetypes import os diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py index 29aeca605..9a336b7e6 100644 --- a/tests/multimodal/utils.py +++ b/tests/multimodal/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import numpy as np from PIL import Image diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py index 77b707a73..dfbcfc15e 100644 --- a/tests/neuron/test_prefix_prefill.py +++ b/tests/neuron/test_prefix_prefill.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random 
from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py index 9b535127f..e3fb6efb2 100644 --- a/tests/plugins/vllm_add_dummy_model/setup.py +++ b/tests/plugins/vllm_add_dummy_model/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from setuptools import setup setup(name='vllm_add_dummy_model', diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 62a8f871f..0c431cb39 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import ModelRegistry diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index 5e7d7d187..3af62b288 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Tuple, Union import torch diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index ac64edfd4..c23ab6430 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py index 569ef216c..bbd11ed4a 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py +++ 
b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py index 316399068..10df0b5e0 100644 --- a/tests/plugins/vllm_add_dummy_platform/setup.py +++ b/tests/plugins/vllm_add_dummy_platform/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from setuptools import setup setup( diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py index 594cef520..0d1b062ac 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py index 5634be3c8..33425bbc1 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.attention.backends.flash_attn import FlashAttentionBackend diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index d7c6bdd70..5cefafc7e 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.platforms.cuda import CudaPlatform diff --git 
a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 661aa5f64..ed50fe535 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from tests.kernels.utils import override_backend_env_variable diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index 5a28943b7..19f393e07 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the with and without prefix caching. Run `pytest tests/prefix_caching/test_prefix_caching.py`. diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 8d16710f1..90d424fe3 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the with and without prefix caching. Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py index 6528b3009..a31d8e873 100644 --- a/tests/prompt_adapter/test_bloom.py +++ b/tests/prompt_adapter/test_bloom.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import vllm diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py index 39a79becd..e249a6e64 100644 --- a/tests/prompt_adapter/test_multi_adapter_inference.py +++ b/tests/prompt_adapter/test_multi_adapter_inference.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import EngineArgs, LLMEngine, SamplingParams from vllm.prompt_adapter.request import PromptAdapterRequest diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py index 2a5f23f7f..fb4c3e149 100644 --- a/tests/prompt_adapter/test_pa_lora.py +++ b/tests/prompt_adapter/test_pa_lora.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from huggingface_hub import snapshot_download from vllm import EngineArgs, LLMEngine, SamplingParams diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 569fc8dfb..4b5210cdf 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 '''Tests whether bitsandbytes computation is enabled correctly. Run `pytest tests/quantization/test_bitsandbytes.py`. diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 1072697ec..7e2e6f6ed 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test model set-up and weight loading for llmcompressor-quantized models. Run `pytest tests/quantization/test_compressed_tensors.py`. 
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index cf77ccec7..0abbd8ebb 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether Marlin models can be loaded from the autogptq config. Run `pytest tests/quantization/test_configs.py --forked`. diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index 21ce5174c..29a5721ef 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Expanded quantized model tests for CPU offloading # Base tests: tests/basic_correctness/test_cpu_offload.py diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index ec31c94ef..b6db6d5f2 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # flake8: noqa """Tests experts_int8 quantization startup and generation, doesn't test correctness diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 4bff73474..5616935eb 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether FP8 computation is enabled correctly. Run `pytest tests/quantization/test_fp8.py --forked`. diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index 68a73f0f8..0e3913676 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test model set-up and inference for quantized HF models supported on the CPU/GPU backend using IPEX (including AWQ/GPTQ). 
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index fa2d9645e..ec60d8a57 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether gptq models with quantized lm_head can be loaded. Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 11382ad70..491370c7c 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test model set-up and weight loading for quark-quantized models. Run `pytest tests/quantization/test_quark.py`. diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 8e7f44a39..9e1867f91 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests register custom quantization config. See https://github.com/vllm-project/vllm/issues/11926 for more details. 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 8ebd8dd2b..7a339c162 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.model_executor.layers.quantization import get_quantization_config from vllm.platforms import current_platform diff --git a/tests/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/runai_model_streamer/test_runai_model_streamer_loader.py index c5722fbae..aa91fa8e1 100644 --- a/tests/runai_model_streamer/test_runai_model_streamer_loader.py +++ b/tests/runai_model_streamer/test_runai_model_streamer_loader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import SamplingParams from vllm.config import LoadConfig, LoadFormat from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader, diff --git a/tests/runai_model_streamer/test_weight_utils.py b/tests/runai_model_streamer/test_weight_utils.py index 5c89bd78a..4afa76c51 100644 --- a/tests/runai_model_streamer/test_weight_utils.py +++ b/tests/runai_model_streamer/test_weight_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import glob import tempfile diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 4d1a6978d..39feb1895 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using beam search. Run `pytest tests/samplers/test_beam_search.py`. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index dc2482d85..7f26698c9 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Make sure ignore_eos works. Run `pytest tests/samplers/test_ignore_eos.py`. 
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 297947012..3b95b0389 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index c07c71e38..59d36099c 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 4190cf7cd..cc6557694 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Make sure bad_words works. Run `pytest tests/samplers/test_no_bad_words.py`. diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index ed2fee1ae..c74c1c02c 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm import SamplingParams diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index dcb1b27bf..cc199bf68 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for rejection sampling.""" from typing import List, Tuple diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 28c34064f..ca09e536a 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import random from dataclasses import dataclass diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index bf1ee6c39..4e8282561 100644 --- 
a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Verify that seeded random sampling is deterministic. Run `pytest tests/samplers/test_seeded_generate.py`. diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index 4ddad66dc..ecf98179c 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for rejection sampling.""" import pytest diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 5cb982a08..53c888816 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import cycle from typing import List, Optional, Sequence, Tuple, Union diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index af8397c23..14a0ebf1d 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm import SamplingParams diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index 5bc70de9d..6d1803f8b 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. 
Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py index b89e58497..c67fa8514 100644 --- a/tests/spec_decode/e2e/test_integration.py +++ b/tests/spec_decode/e2e/test_integration.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests which cover integration of the speculative decoding framework with other features, e.g. cuda graphs. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 7001ee4c0..e5a542b6d 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests which cover integration of the speculative decoding framework with tensor parallelism. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 2cb10de1c..cb9c46dc7 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests which cover integration of the speculative decoding framework with tensor parallelism. 
""" diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 1a543606c..5991a8b02 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import cycle import pytest diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index dbcbc0db1..807f41cc9 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 1fa1104f5..a2b84b902 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 05ad468dd..d396e52a9 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """The tests in this file verify end-to-end speculative decoding correctness. This docstring details important information on the testing methodology. 
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 77f8b8998..1aff53cb5 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py index e42cf416b..b7d279f29 100644 --- a/tests/spec_decode/e2e/test_seed.py +++ b/tests/spec_decode/e2e/test_seed.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from .conftest import run_equality_correctness_test diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 3504fcf43..fe95ff9b9 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index aa49a3aee..0bff0ea1d 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock, patch import pytest diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 7477486a3..1a6693e16 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 0b5d82b66..2bf401613 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ 
b/tests/spec_decode/test_multi_step_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import Dict, List from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index f66e95718..7de54b3ed 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm.sequence import ExecuteModelRequest diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 5a093dea1..7bbbb0236 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index d8c3af4c1..eee0f4c89 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from collections import defaultdict from types import SimpleNamespace diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 195fce648..24573e224 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock import pytest diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 2f883c2ff..38f57e99b 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import count from typing import Callable, Dict, List, Optional from typing import Sequence as GenericSequence diff --git a/tests/standalone_tests/lazy_torch_compile.py b/tests/standalone_tests/lazy_torch_compile.py index b950877a4..b3b580952 100644 --- 
a/tests/standalone_tests/lazy_torch_compile.py +++ b/tests/standalone_tests/lazy_torch_compile.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Description: Test the lazy import module # The utility function cannot be placed in `vllm.utils` # this needs to be a standalone script diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 2a4565362..694bb5fbc 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools import gc from typing import Callable, TypeVar diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 6e7eec1c6..b268d4bf0 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gc import json import os diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index e8f8499aa..17c128a17 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test hashing of cache blocks. Run `pytest tests/test_cache_block_hashing.py`. 
diff --git a/tests/test_config.py b/tests/test_config.py index ec366b93d..2dfae218b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import asdict import pytest diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py index ffeacf34b..a9b4f5cbf 100644 --- a/tests/test_embedded_commit.py +++ b/tests/test_embedded_commit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import vllm diff --git a/tests/test_inputs.py b/tests/test_inputs.py index fff7c5fc0..fff909154 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/test_logger.py b/tests/test_logger.py index e3749616d..993822e92 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import logging import os diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 39c1c3815..487fbb8fc 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import Tuple from unittest.mock import patch diff --git a/tests/test_regression.py b/tests/test_regression.py index 5d27d3579..f781b3113 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Containing tests that check for regressions in vLLM's behavior. It should include tests that are reported by users and making sure they diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py index 01cbe0c99..40e26ed51 100644 --- a/tests/test_sampling_params.py +++ b/tests/test_sampling_params.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the SamplingParams class. 
""" from vllm import SamplingParams diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py index a9221f08c..6e36f2c33 100644 --- a/tests/test_scalartype.py +++ b/tests/test_scalartype.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 30e53a180..902de1099 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.model_executor.layers.sampler import SamplerOutput diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 2412da503..088b95be7 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import multiprocessing as mp import os import shutil diff --git a/tests/test_utils.py b/tests/test_utils.py index d5dc4464e..5b69ffd18 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import socket diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index 4c8238fd8..cd60cefd7 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from copy import deepcopy from transformers import AutoTokenizer diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 84348cbc0..57832394d 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, Generator, List, Optional import pytest diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py index 875ca19d3..787fb6ea6 100644 --- a/tests/tokenization/test_get_eos.py +++ 
b/tests/tokenization/test_get_eos.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This test file includes some cases where it is inappropriate to only get the `eos_token_id` from the tokenizer as defined by diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py index 8db7204f1..eddc63098 100644 --- a/tests/tokenization/test_tokenizer.py +++ b/tests/tokenization/test_tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from transformers import PreTrainedTokenizerBase diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 3faaf326f..8e99f8691 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import sys diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py index 294acf202..39ab01c9b 100644 --- a/tests/tool_use/conftest.py +++ b/tests/tool_use/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import pytest_asyncio from huggingface_hub import snapshot_download diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py index 3d0fe8f06..7bee56281 100644 --- a/tests/tool_use/test_chat_completion_request_validations.py +++ b/tests/tool_use/test_chat_completion_request_validations.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.entrypoints.openai.protocol import ChatCompletionRequest diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 75bbfbb76..da033fa1d 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import openai diff --git a/tests/tool_use/test_jamba_tool_parser.py 
b/tests/tool_use/test_jamba_tool_parser.py index 3095ef451..7e349c512 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Generator, List, Optional diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index c294cb049..b49a5e8e7 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, List, Optional diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index fe8cb496c..45f1bfc45 100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, List, Optional diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 2241f1846..a7dfb1078 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from copy import deepcopy from typing import Any, Dict, List, Optional diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index b7124ebc1..6ed83f30e 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import glob import os import tempfile diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index bb1379deb..e94bbd287 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from vllm.config import CompilationLevel diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py index 6cd5615c4..3db9bc73a 100644 --- a/tests/tpu/test_quantization_accuracy.py +++ 
b/tests/tpu/test_quantization_accuracy.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass import lm_eval diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 49a16d16e..592775e8b 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import threading from concurrent import futures diff --git a/tests/utils.py b/tests/utils.py index f4eecf19e..3b32052fe 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import copy import functools diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 0a5ba1f98..60cf4384d 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.multimodal.inputs import MultiModalKwargs diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 5c1cda285..2e16d7d25 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the with and without prefix caching.""" import pytest diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 8ec9f1ba3..a8079dcce 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 10f783b21..4b5bc9ced 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from contextlib import ExitStack from typing import List, Tuple diff --git 
a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index ff38a4568..a3540582a 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm import envs diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 033bbcfce..6a91f1901 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import uuid diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e2c728b22..b2539132f 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import time import uuid diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 4735c6f94..5782a249f 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 5ebf72927..f7eedcb9c 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Set, Tuple import numpy as np diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py index 580392ac5..48419d8a2 100644 --- a/tests/v1/test_stats.py +++ b/tests/v1/test_stats.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.sampling_params import SamplingParams diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py index ac773b611..9b669ae00 100644 --- a/tests/v1/test_utils.py +++ b/tests/v1/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + from typing import List import torch diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 694ce81ff..5b40fbff8 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Set, Tuple import numpy as np diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py index 790e891ec..c03943149 100644 --- a/tests/vllm_test_utils/setup.py +++ b/tests/vllm_test_utils/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from setuptools import setup setup( diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py index 6505c8154..1d1219fbe 100644 --- a/tests/vllm_test_utils/vllm_test_utils/__init__.py +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ vllm_utils is a package for vLLM testing utilities. It does not import any vLLM modules. 
diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index 1ddd3471d..392fd2705 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import dataclasses import sys diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py index a237f53a7..44d45f262 100644 --- a/tests/vllm_test_utils/vllm_test_utils/monitor.py +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import dataclasses import sys diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 7a3786456..e456bfab8 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index a6b3cb575..0ce0465a7 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from typing import List diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index 57f1fd47a..eb341fb1b 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import List, Tuple, Type diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index aabe913c2..c32ceb4fa 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff 
--git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 79233c757..22466105b 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm.engine.arg_utils import EngineArgs diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index acede959f..7ae0f4bb8 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm.engine.arg_utils import EngineArgs diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py new file mode 100644 index 000000000..3f7fd66bf --- /dev/null +++ b/tools/check_spdx_header.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 + +import sys + +SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0" +SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:" + + +def check_spdx_header(file_path): + with open(file_path, encoding='UTF-8') as file: + lines = file.readlines() + if not lines: + # not necessary for an empty file like __init__.py + return True + if not lines[0].strip().startswith(SPDX_HEADER_PREFIX): + return False + return True + + +def add_header(file_path): + with open(file_path, 'r+', encoding='UTF-8') as file: + lines = file.readlines() + file.seek(0, 0) + file.write(SPDX_HEADER + '\n\n' + ''.join(lines)) + + +def main(): + files_with_missing_header = [] + for file_path in sys.argv[1:]: + if not check_spdx_header(file_path): + files_with_missing_header.append(file_path) + + if files_with_missing_header: + print("The following files are missing the SPDX header:") + for file_path in files_with_missing_header: + print(f" {file_path}") + add_header(file_path) + + sys.exit(1 if files_with_missing_header else 0) + + +if __name__ == "__main__": + main() diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 54cd60c2b..adbb7301b 100644 --- 
a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import json from typing import Dict diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index cb56ebd69..c527cdbe0 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import json diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 9dc19f5fd..33e85b9ff 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + #!/usr/bin/env python3 # Copyright (c) 2018 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be diff --git a/use_existing_torch.py b/use_existing_torch.py index 319d26289..a578328b0 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import glob requires_files = glob.glob('requirements*.txt') diff --git a/vllm/__init__.py b/vllm/__init__.py index 2aabe820d..566c5116d 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" import os diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index da237da2e..ce4f75341 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import importlib from typing import TYPE_CHECKING, List, Optional, Tuple, Union diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 28b804f76..ccb67baa5 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, 
Optional, Tuple import torch diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 3ed60678b..18e0c5227 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Tuple diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py index 468904c90..f9a5d2fff 100644 --- a/vllm/adapter_commons/models.py +++ b/vllm/adapter_commons/models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Optional, TypeVar diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py index 2bb17fdc0..2b604b91b 100644 --- a/vllm/adapter_commons/request.py +++ b/vllm/adapter_commons/request.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py index 1e9adca50..c2dc5433c 100644 --- a/vllm/adapter_commons/utils.py +++ b/vllm/adapter_commons/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, Optional, Set diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py index 83929e82e..ce24e08a5 100644 --- a/vllm/adapter_commons/worker_manager.py +++ b/vllm/adapter_commons/worker_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Any, Optional, Set diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index a46c67ad7..d9e51082e 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Literal from urllib.parse import urljoin diff --git a/vllm/assets/base.py b/vllm/assets/base.py index 249173141..03f3b9dab 100644 --- a/vllm/assets/base.py 
+++ b/vllm/assets/base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from pathlib import Path from typing import Optional diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 0a55506f8..2b1d258da 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Literal diff --git a/vllm/assets/video.py b/vllm/assets/video.py index eca2ccc54..494cfc383 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from functools import lru_cache from typing import List, Literal diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 2cd4ad3e0..85c5715fa 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata, AttentionMetadataBuilder, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index b9425f659..5f0a54013 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 20e9a3f13..9765e7881 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 4a9aa1e21..6a82127ac 100755 --- 
a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer with FlashAttention.""" from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 7cccef960..715ed6748 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 80c132c0a..1518e518e 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 57916a3c6..b4879af4c 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index e8fec234c..9b63192ed 100644 --- a/vllm/attention/backends/mla/utils.py +++ b/vllm/attention/backends/mla/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from dataclasses import dataclass from typing import Any, Dict, Generic, List, Optional, Tuple diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index be06d1600..f58528dbf 100644 --- 
a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 209a623ba..b61dfe63d 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 826311896..9f6e731af 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 12110ec73..02bff57a6 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer ROCm GPUs.""" from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index c3b2398b4..25fe6ed95 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 95dc119a4..20d7ef0fa 100644 --- a/vllm/attention/backends/triton_mla.py +++ 
b/vllm/attention/backends/triton_mla.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 7f2fe7e83..ad53e4e70 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention backend utils""" from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 49f47f9c8..723a4558d 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b97165f62..19ee89630 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer.""" from typing import Any, Dict, List, Optional diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py index 727a470ba..71caf3cba 100644 --- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import triton import triton.language as tl diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index 350f88c8f..6ab69ea5b 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -1,3 
+1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import torch diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py index 78d752230..4de9bd530 100644 --- a/vllm/attention/ops/blocksparse_attention/utils.py +++ b/vllm/attention/ops/blocksparse_attention/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Helper functions for 3D sparse pattern # These function are not optimized and very inefficient. # Avoid calling them too frequent or use a cache mechanism. diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 4c0fb2a62..8bb536343 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 3a07184ed..598ceea13 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Optional, Tuple try: diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 9de4ef7f5..68aa63f5a 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass import neuronxcc.nki.isa as nisa diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index fd6232914..2c60bd0c3 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import List, Optional, Tuple diff --git 
a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index ec3c8459c..fbb6757ee 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # The kernels in this file are adapted from LightLLM's context_attention_fwd: # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index 675df109b..ec5ec4ce6 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py # which was originally adapted from diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index ef04603f2..ab8fb8953 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + #!/usr/bin/env python """ Fused Attention diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 4c6bbc727..26c6ac812 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from contextlib import contextmanager from functools import cache diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 026037e54..97b2b630f 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 7f4f97466..979890170 
100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ast import copy import dataclasses diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 6385f1c5d..a6f11a3af 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy import dataclasses from contextlib import contextmanager diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 17eb0592c..20afe6967 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import inspect from typing import Callable, Dict, List, Optional, TypeVar, Union, overload from unittest.mock import patch diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index e15d7b315..9b0e9c5d0 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import operator from typing import Dict, Iterable, List, Optional, Tuple, Union diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index cde27bd10..0c3d8697b 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, Dict, List, NamedTuple, Optional, Tuple import torch diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index 924e26f2e..b9a8d3112 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import operator from typing import Iterable, Optional diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index f6846c08a..be663946f 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py 
@@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import hashlib import inspect import types diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index b97e40415..786c7c1e1 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index b6bcecdc8..e6f6a60b2 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import abc import operator from abc import abstractmethod diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 34f5f3557..c7387fb7c 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List from torch import fx as fx diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py index ba28b1f0b..292baae85 100644 --- a/vllm/compilation/reshapes.py +++ b/vllm/compilation/reshapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Union import torch.fx diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index b8c52a7f4..1d2597e42 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 58a8fa76f..a8a283ddd 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys from abc import abstractmethod diff --git a/vllm/config.py b/vllm/config.py index a13700aba..d2d59c705 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + import ast import copy import enum diff --git a/vllm/connections.py b/vllm/connections.py index 4c9f4f40c..dc060bb6f 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from pathlib import Path from typing import Mapping, MutableMapping, Optional from urllib.parse import urlparse diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 90c1438ef..d4d31c58d 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import List, Optional diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 115f663e4..1966eac1c 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import deque from dataclasses import dataclass from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index c3e1665b4..359b5b263 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, FrozenSet, List, Optional, Tuple from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index cb432db91..0b0197deb 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index c38ae2dd6..c388366b8 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from collections import deque from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index ccdc5daa9..fbf19e1b4 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Token blocks.""" import sys from bisect import bisect_left diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 1c6578e4c..910afdd9f 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Block manager utils.""" from vllm.sequence import SequenceGroup from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 2d6a132ed..c5b3b04f3 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A block manager that manages token blocks.""" from typing import Dict, List, Optional from typing import Sequence as GenericSequence diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index c93065182..0e363eddc 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import heapq from abc import ABC, abstractmethod diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 9c7e246e3..b48ba87e9 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from abc import ABC, abstractmethod from typing import List diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index f9924be4a..70c22afa8 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from typing import List, Tuple from vllm.core.interfaces import AllocStatus, BlockSpaceManager diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 2bb961481..f507847ad 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import os import random diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index a43418dbb..f74ad9ac3 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # cumem-based pytorch pluggable allocator to implement sleep mode. # other approaches tried but failed: # - cuda-python package binding diff --git a/vllm/distributed/__init__.py b/vllm/distributed/__init__.py index db325cfab..39955ddac 100644 --- a/vllm/distributed/__init__.py +++ b/vllm/distributed/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .communication_op import * from .parallel_state import * from .utils import * diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index e13505dc3..0228264f9 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, Optional, Union import torch diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index d5a53381c..010caf7eb 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This file is a pure Python wrapper for the cudart library. It avoids the need to compile a separate shared library, and is convenient for use when we just need to call a few functions. 
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 62929dc0f..a2614ed5d 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ctypes from contextlib import contextmanager from typing import List, Optional, Union diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 1f78e10cc..d8d6eed2d 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ctypes import json import os diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py index cc9b19ce0..3f85da98a 100644 --- a/vllm/distributed/device_communicators/hpu_communicator.py +++ b/vllm/distributed/device_communicators/hpu_communicator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.distributed as dist from torch.distributed import ProcessGroup diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index efc599871..0ccd42312 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Union # ===================== import region ===================== diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 7dea61b6a..03c3b0be7 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -1,3 
+1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # This file is a pure Python wrapper for the NCCL library. # The main purpose is to use NCCL combined with CUDA graph. # Before writing this script, we tried the following approach: diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 268edc092..48ac81ac0 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pickle import sys diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index 765a0f9cb..7af7c65f6 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import torch diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index eafd3c2f6..79ccc101e 100644 --- a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.distributed as dist from torch.distributed import ProcessGroup diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index 6089e3bab..57c764b48 100644 --- a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ KVConnectorBase Class for Distributed KV Cache & Hidden State communication diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 6372dab72..fe4805334 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import importlib from typing import TYPE_CHECKING, Callable, Dict, Type diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 7780e2dfa..2033e9762 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Simple KV Cache Connector for Distributed Machine Learning Inference diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index bad119a1a..845da7c50 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file contains a new class `KVLookupBufferBase` that allows developers to think of KV cache operations as inserting new KV cache entries (`insert`) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index fe8d8d737..5e1b62352 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Implements a distributed key-value (KV) cache transfer mechanism. 
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 4b0cb44cc..40589fb3e 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file defines an interface `KVPipeBase` that provides an abstraction for sending and receiving tensors, or None, via diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py index 8e4358672..58ab7f0b6 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os import pickle diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 98222fa67..7aa53d07a 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This module implements a PyNccl pipe for sending and receiving Optional[torch.Tensor] between distributed ranks with advanced diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py index 9ce97851d..1e80e0bd7 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_agent.py +++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A centralized entrypoint to perform distributed KV cache transfer. This implementation is a shim wrapper on two APIs exposed by `kv_connector`: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 7fe9b68d4..c5c5dfbba 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. 
# Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index dcfcb848c..84f8c0a8e 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cc7c99e50..7c0e8c214 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import json diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 739ea06ae..053635a28 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import copy import time diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index 4b1842625..aa54c0693 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Workaround for https://github.com/python/cpython/issues/86296 # # From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dd677300f..d82d9ad9d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy import time from collections import Counter as collectionsCounter diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index b771c190d..ce806b4a9 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter 
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 5c7a430d1..7f0c2fa70 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ These types are defined in this file to avoid importing vllm.engine.metrics and therefore importing prometheus_client. diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index d9703b820..3cf1850ee 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import uuid from dataclasses import dataclass, field from enum import Enum diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 5237f63c3..85b5f31e3 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import copy import pickle diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 166f89743..a0dd79586 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle import signal from contextlib import contextmanager diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 50adaf4e5..4c8e295c1 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Callable, List diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 99c2baf3f..8ceef855e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + import functools from typing import Callable, List, cast diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 55c56abea..4d96791a1 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from vllm.config import SchedulerConfig diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 4b701f815..3bca0bee3 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 770982a20..0d2b58c10 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from typing import Sequence as GenericSequence from typing import cast diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index de7b2c1b9..d11125586 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from abc import ABC, abstractmethod from typing import AsyncGenerator, List, Mapping, Optional diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index daefbff7e..96818507d 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. It is not intended for production use. 
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 97d2561df..3a6e75b1d 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import codecs import json diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 5dcf50bd1..351a39525 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import signal from http import HTTPStatus diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 46b595b0d..d071a0b3c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import warnings from contextlib import contextmanager diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index 584ee0d9e..e82b6ba6c 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Union from vllm.logger import init_logger diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9e5cf4ba2..b8f54d6c7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import atexit import gc diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 9cfe07c65..3054958f3 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file contains the command line arguments for the vLLM's OpenAI-compatible server. 
It is kept in a separate file for documentation diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index c8132811d..41e5eef40 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache, partial from typing import Dict, FrozenSet, Iterable, List, Optional, Union diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 29d071ce5..83b841826 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import re diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py index a21bff52f..80354d69b 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/__init__.py +++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py index e5d10ee0b..b5df7e474 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py +++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from functools import cached_property from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py 
b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py index a440ddc8d..5c19888d4 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py +++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import re from typing import Optional, Sequence, Tuple, Union diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 37ae23506..675d3cdcf 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from http import HTTPStatus from io import StringIO diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index dc97f0eb0..107220d54 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import json import time diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 13c392636..e7ad263e7 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import time from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e7116a3d9..45f8ad90d 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import base64 import time diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8d54164e5..8d39fdcb7 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ 
b/vllm/entrypoints/openai/serving_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from concurrent.futures.thread import ThreadPoolExecutor from http import HTTPStatus diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 22e74b387..f917a4851 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import pathlib from dataclasses import dataclass diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 583032207..01a3d211f 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import base64 import time diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py index be4420261..366df7121 100644 --- a/vllm/entrypoints/openai/serving_rerank.py +++ b/vllm/entrypoints/openai/serving_rerank.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 381edf8fa..832aa8516 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import time from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index b67ecfb01..6c79adf90 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Final, List, Optional, Union from 
fastapi import Request diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 2850349a4..d1c3afa64 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .abstract_tool_parser import ToolParser, ToolParserManager from .granite_20b_fc_tool_parser import Granite20bFCToolParser from .granite_tool_parser import GraniteToolParser diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index aa7c20109..7cdd6d4c4 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from functools import cached_property from typing import Callable, Dict, List, Optional, Sequence, Type, Union diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 93e357e8b..002bf1738 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from json import JSONDecoder diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 8aefcd8d5..c948ed78f 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 
869d15ac3..4841b2870 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from typing import Dict, List, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index cb391e11b..b9215e797 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index cfd024853..7c4d63e18 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from typing import Dict, List, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 1856308b8..6a7b11362 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from json import JSONDecoder diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index bada805dd..51354f7c9 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from random import choices diff --git 
a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 26da4d689..5c282b5c2 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ast import json import re diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index 5e4eb23bf..945cbd683 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from json import JSONDecodeError, JSONDecoder from typing import Any, List, Tuple diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index e8a78d216..9af37871d 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import functools diff --git a/vllm/envs.py b/vllm/envs.py index 25098070b..78ee3047b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import tempfile from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 471d1bfac..fb76276bb 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from abc import ABC, abstractmethod from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index 78c86321d..d1f8c36fb 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing import 
Any, Callable, List, Optional, Union diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index c467115f1..e680d53cb 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from typing import Any, Type diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 539b6ae2d..cef6a994a 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import sys diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 2afd99f99..80e7a1c40 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from collections import defaultdict diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index e55155ea0..5d5cc8398 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time from collections import defaultdict diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index a5c4dcf0e..dcb4a8f27 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 828b394ec..10de8bc59 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index a0dd89f69..6f8f2cd75 
100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, SingletonInputs, SingletonInputsAdapter, SingletonPrompt, diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 57e85779d..2ffebeee3 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from functools import cached_property from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 09f1ff2cb..454d9d830 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Literal, Sequence, TypedDict, Union, cast, overload from typing_extensions import TypeIs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 70372e0ca..4d8f28cb0 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from typing import List, Mapping, Optional, Union diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4b73ade7a..0ec726b8b 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools from collections import UserDict from dataclasses import dataclass diff --git a/vllm/logger.py b/vllm/logger.py index cac174f7b..b20d55e3c 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Logging configuration for vLLM.""" import datetime import json diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 576ccf78a..7ab463258 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from vllm.logging_utils.formatter import NewLineFormatter __all__ = [ diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index b24b4e11d..010b0a124 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging diff --git a/vllm/logits_process.py b/vllm/logits_process.py index 7716ccd27..d02072e8f 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Tuple, Union import torch diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 545ec21ca..3d6620817 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # pylint: disable=unused-argument from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index cdd439d03..9f0297596 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # pylint: disable=unused-argument import math from dataclasses import dataclass diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 93ad4651f..00299bf6c 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from typing import Sequence as GenericSequence diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 2e04cb902..ef77fd4b7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy import math import os diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py index 9c9159b95..85601d58c 100644 --- a/vllm/lora/ops/torch_ops/__init__.py +++ b/vllm/lora/ops/torch_ops/__init__.py @@ -1,3 +1,5 @@ 
+# SPDX-License-Identifier: Apache-2.0 + from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, sgmv_expand, sgmv_expand_slice, diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py index 5f5aafd51..af79f9841 100644 --- a/vllm/lora/ops/torch_ops/lora_ops.py +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py index 9805b6dd5..dc440f732 100644 --- a/vllm/lora/ops/triton_ops/__init__.py +++ b/vllm/lora/ops/triton_ops/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink diff --git a/vllm/lora/ops/triton_ops/bgmv_expand.py b/vllm/lora/ops/triton_ops/bgmv_expand.py index 42adb191b..98510b396 100644 --- a/vllm/lora/ops/triton_ops/bgmv_expand.py +++ b/vllm/lora/ops/triton_ops/bgmv_expand.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/bgmv_expand_slice.py b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py index f397d752a..48804123c 100644 --- a/vllm/lora/ops/triton_ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/bgmv_shrink.py b/vllm/lora/ops/triton_ops/bgmv_shrink.py index f3ef01d39..227a5765e 100644 --- a/vllm/lora/ops/triton_ops/bgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/bgmv_shrink.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 48fa5cd63..a8e71cacf 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 9bb35e8ff..8b26583c1 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 7df5bc2c2..78409b91a 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools from typing import Dict, List, Tuple diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index b9c506f6e..9496ab5a7 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py import json diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py index 48ada3926..915fc6623 100644 --- a/vllm/lora/punica_wrapper/__init__.py +++ b/vllm/lora/punica_wrapper/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index b9ec0c4bc..1a2282ae9 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py index b9ae3e074..29428f4cf 100644 --- a/vllm/lora/punica_wrapper/punica_cpu.py +++ b/vllm/lora/punica_wrapper/punica_cpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, Optional, Tuple, Union import torch diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 451f23e49..9ccd9c36a 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index d9c4f44a1..51e1bfab3 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple, Union, final import torch diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index a29322465..ad5d4b788 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import resolve_obj_by_qualname diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 7360c8c09..dbc2d27c5 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 5e3d2f0ed..badfaa419 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import 
warnings from typing import Optional diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index d72b7638d..f47b0af15 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import re from typing import List, Optional, Set, Tuple, Type, Union diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index a64296f7f..f33a7b88c 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import contextmanager from typing import Any, Dict, List, Literal, Optional, Set, Type, Union diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 7278c7fbe..763615217 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) from vllm.model_executor.sampling_metadata import (SamplingMetadata, diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 96995c56b..ee4f41ea6 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, Type import torch.nn as nn diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 18b435a42..cf96461a5 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations from typing import TYPE_CHECKING diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index 8deb4c949..db4ce2680 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ 
b/vllm/model_executor/guided_decoding/guided_fields.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional, TypedDict, Union diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index a17e75a80..7eaf9e38e 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from json import loads as json_loads from typing import Optional, Union diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index eb8db8824..ba9c98290 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import concurrent.futures import os diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index e4eb3f16e..ab72b55a8 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024- the Outlines developers # This file is adapted from # https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 90dfa62ec..87ef45358 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import re diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py 
b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index ee30ce96f..c01bd3af1 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # noqa: UP007 from __future__ import annotations diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index fb9684ac1..f782920d0 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Custom activation functions.""" import math from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index c4223d126..6f933c3fa 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import contextmanager from typing import Any, Dict, Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 87993267c..4ca569ca4 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Fused MoE utilities for GPTQ.""" import functools from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index c80e6bf07..9613696a0 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Fused MoE kernel.""" import functools import json diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index da0ce1885..3c7ef5e00 
100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from enum import Enum from typing import Callable, List, Optional, Tuple diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 563ee18c6..0365afa10 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.nn.functional as F from torch_xla.experimental.custom_kernel import _histogram diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py index bcff55f4f..d9a5de1b3 100644 --- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.nn.functional as F diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 43ea4eb5a..b476fb0db 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Custom normalization layers.""" from typing import Optional, Tuple, Union diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 52263e96f..08f1e103e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from abc import abstractmethod from typing import Dict, List, Optional, Tuple diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 42decde1d..ebf74c67d 100644 --- 
a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A layer that compute logits from hidden_stats.""" import inspect from typing import Optional diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 606c796d5..93c3cc91b 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from torch import nn from torch.nn.parameter import Parameter diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index be5639df9..21e27160f 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright (c) 2024, Tri Dao. # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 1484b7981..3c35f1ac0 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright (c) 2024, Tri Dao, Albert Gu. 
# Adapted from https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/selective_state_update.py diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 75bf33dc7..0012636ef 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from enum import IntEnum from typing import List, Optional, Union diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index bd0fd4799..6ded3874f 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Type from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 72c89fe2b..6c08d016c 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Supports AQLM compression, see https://github.com/Vahe1994/AQLM # and https://arxiv.org/pdf/2401.06118.pdf diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index d83528e9e..ff77af44d 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 0c3c98168..8849ba292 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing 
import Any, Callable, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py index ace8f4a34..09efd4dbd 100644 --- a/vllm/model_executor/layers/quantization/awq_triton.py +++ b/vllm/model_executor/layers/quantization/awq_triton.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import triton import triton.language as tl diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 2fb2642dd..2eefcc4f3 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import inspect from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Type diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 5dc872933..889eda009 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 37981ed91..24f7542e1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import suppress from typing import Any, Dict, List, Literal, Optional, Tuple, cast diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index e1c45f4e4..db8e8a4b6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from enum import Enum from typing import Callable, List, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 569ecaa6f..b26c74f24 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .compressed_tensors_scheme import CompressedTensorsScheme from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index 21e6fe7a2..84f924b23 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index b4bab33e1..daa25d23a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 2e1b5e3c2..535ea6b32 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index 1671a23d7..5c8261908 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 1d4e4bd52..5dcc41a9e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 0e3f47317..08d86a4e5 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Set import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 2dd243b9c..38df09ff3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Set import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index f4c1dbc03..b69c5e7a0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Type import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 34996b08e..d700a0b15 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + 
import re from typing import Iterable, Optional diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 36598b3e2..b41236501 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 100cbfa4c..87fbcf62a 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 7b71e13b5..da5ef36c5 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index adab1973b..86e025310 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index f0943efa0..86e6dbb5a 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import gguf diff --git 
a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index abafad0f1..0cb77a754 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from enum import Enum from fractions import Fraction diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4dc4b052b..99ab29995 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional, Set, Union import torch diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index 07552c0f1..cec984483 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index 28538d299..432f43688 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index c16a96213..2531170ec 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git 
a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index 915bdc477..c06befaf3 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Callable, Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index 83549870e..bcfdb1677 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Type import vllm.envs as envs diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py index 1d85d62ec..2706fbb53 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py index 15df0200f..3f0586f6e 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial from typing import Optional, Tuple diff --git 
a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index 6969583d6..e21801cf6 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index c4a83b4fa..91e765405 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 4824a1180..a5967995a 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Dict, List, Optional, Type diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 2e83a0428..2bf21a05c 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py 
b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index 97ec8cb05..5da5df8ef 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index 9de668e65..0bf090d7f 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings from typing import Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index e1870c73c..388a4f166 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm.logger import init_logger diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 20212e672..4cf0c677c 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index a1b3eeb43..348e9bccd 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py 
b/vllm/model_executor/layers/quantization/moe_wna16.py index 11a9d4ac5..1ae765a22 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 2d5cdfa16..a8e8be207 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from importlib.util import find_spec from typing import Any, Dict, List, Optional diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 2ccd08202..6e9d3dc6c 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index fc214255e..0451cf82b 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import fnmatch import re from typing import Any, Dict, List, Optional, cast diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 68a395454..98743b15e 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, Optional import torch diff --git 
a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py index fb0ba9bd5..9069b5a0d 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .quark_scheme import QuarkScheme from .quark_w8a8_fp8 import QuarkW8A8Fp8 from .quark_w8a8_int8 import QuarkW8A8Int8 diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py index 239597fa4..40c8ea86d 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py index 206931ea2..c885e98a4 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py index 8cb47e9c3..1bf34b098 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Set import torch diff --git a/vllm/model_executor/layers/quantization/quark/utils.py 
b/vllm/model_executor/layers/quantization/quark/utils.py index 742a629bd..afb1d9d63 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import re from typing import Any, Iterable, Optional diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py index a26c52478..026881f2d 100644 --- a/vllm/model_executor/layers/quantization/schema.py +++ b/vllm/model_executor/layers/quantization/schema.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file contains the Pydantic schemas for various quantization-related parameters. When a relevant quantization technique is specified, these diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 605c3a386..3234fecaa 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py index e60f0c79a..f7ee47288 100644 --- a/vllm/model_executor/layers/quantization/utils/__init__.py +++ b/vllm/model_executor/layers/quantization/utils/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .layer_utils import replace_parameter, update_tensor_inplace __all__ = ['update_tensor_inplace', 'replace_parameter'] diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 850820f66..29c7268ad 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + # Adapted from https://github.com/sgl-project/sglang/pull/2575 import functools import json diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py index edce6d19b..5acae7ca3 100644 --- a/vllm/model_executor/layers/quantization/utils/layer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Union import torch diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py index 18e133205..cb7d49ed6 100644 --- a/vllm/model_executor/layers/quantization/utils/machete_utils.py +++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index c9366ca97..3beba3083 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import numpy diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 245fe9238..6120a8e66 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 4a06c5d63..fb557a313 100644 --- 
a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utility functions used for tests and benchmarks""" from typing import List, Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py index 17d09055b..3654268e2 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utility functions used for tests and benchmarks""" import random diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py index cb58eb945..176b2947a 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import numpy diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 95e785dcc..62484f62f 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This file is used for /tests and /benchmarks""" from typing import List, Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 3af3b3e0e..3fd88e875 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + 
from typing import List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 9d6c3797c..62e27b714 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import cached_property from importlib.util import find_spec from typing import Dict, Optional, Tuple diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index a67713c32..4c9860006 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d071cfe88..814c3b7d9 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 8dc26309d..6af734be5 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A layer that samples the next tokens from the model's outputs.""" import itertools import warnings diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 6aa4b8bd3..35c7ffec2 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from typing import Dict, Optional, Union diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index 584cf971d..95362c280 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.jit diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index f6f34cd49..dfe71028c 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utility methods for model layers.""" from typing import Tuple diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index f230efaca..e409094dd 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import List, Optional, Sequence, Tuple diff --git a/vllm/model_executor/model_loader/__init__.py 
b/vllm/model_executor/model_loader/__init__.py index 12468997e..9048c70c7 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from torch import nn from vllm.config import VllmConfig diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 4be511d12..809af81d7 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa: SIM117 import collections import copy diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index a90fbd648..d900fb3a7 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utilities for selecting and loading neuron models.""" import copy import importlib diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index e6299295c..7bd531c56 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa: SIM117 from pathlib import Path from typing import List, Optional, Tuple diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 9266ca75d..117251ccf 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import io diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 3f923d2f6..084ca53b1 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,3 +1,4 @@ +# 
SPDX-License-Identifier: Apache-2.0 """Utilities for selecting and loading models.""" import contextlib from dataclasses import dataclass, field diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index e4d103f7c..cade0a1dd 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utilities for downloading and initializing model weights.""" import fnmatch import glob diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index a3ef9adad..6be4a8341 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, has_inner_state, supports_lora, supports_multimodal, supports_pp) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 55e90b9d4..3e1daa773 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Optional, TypeVar diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index fd6b5659d..d015682aa 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Inference-only Snowflake Arctic model.""" from typing import Iterable, List, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 8c6873de1..97502c38b 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import 
(Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index a923ed36a..5dfaa727b 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 57eb5adc8..204c48d0d 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Derived from BART implementation posted on HuggingFace; license below: # # coding=utf-8 diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 4be136543..4d0f5ac8e 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Set, Tuple import torch diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 987dfaf44..bedbdceb7 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" from typing import Iterable, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index b559ac677..2b0452222 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) diff --git 
a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index fee74f491..229677ae7 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index e834c9004..9061a3128 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import cached_property from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index d5f9b4d19..b81a9e917 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index dd69f6c9a..1e784f5b4 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" from typing import Iterable, List, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 989056bf5..e73627da0 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index b2aa3c070..bb3f4f40d 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Set, Tuple, Union import torch diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index c55185395..b239b642f 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 DeciAI Research Team. All rights reserved. diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 74b6bfdf2..9599e1df6 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 73388cd26..f5fede4d8 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py index 06ea3dab9..a4829aa1a 100644 --- a/vllm/model_executor/models/deepseek_v3.py +++ b/vllm/model_executor/models/deepseek_v3.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 344832d8b..1343b9762 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 948560b49..373a728be 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Tuple import torch diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index bc3295da7..2eb91a682 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py # Copyright 2024 The LG U+ CTO AI Tech Lab. 
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index b93a68680..310aca999 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. # diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index c503a368e..01b66a1c2 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 3a5fe8e1f..4a1ad5f4e 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import Iterable, List, Optional, Set, Tuple diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index dbf9da50c..6d8c82968 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. # Copyright 2023 HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index b23aba829..cb81aa41e 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. 
# diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index f0dc76939..a6dc8f847 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py index 942d1e14b..5f1903345 100644 --- a/vllm/model_executor/models/glm.py +++ b/vllm/model_executor/models/glm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Inference-only HF format GLM-4 model compatible with THUDM weights.""" from vllm.config import VllmConfig from vllm.model_executor.models.llama import LlamaForCausalLM diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 51922e6f2..4449eb8e8 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 2f1aa2d68..7ad9a24dc 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c64bc7068..887a44474 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 08298cc0d..815aba145 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 731642772..550ca3f7c 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 543b4e2f5..85911a0f4 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index cdf9414d5..8ae661bf1 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index d179d6235..7bda54ea7 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from typing import List, Optional, Union diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index df7e768fe..91c89b159 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 4e42a4b6f..f9c2175b2 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. # Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index d16a77f86..9e2e677a6 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c5fd0d933..0fc5c4db1 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, Protocol, Type, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 37b91a803..c5f7be135 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import (TYPE_CHECKING, List, Optional, Protocol, Type, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 8ad009d51..0499f339b 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py # -------------------------------------------------------- # InternVL diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 28c23edd4..c211ca5f4 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial from typing import Any, 
Dict, Iterable, List, Optional, Set, Tuple, Type, Union diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 93ac2dcf8..106c3b6b7 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index f4b7e4478..c46a867a7 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- # InternVL diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 8c81dff6b..72bcef5e2 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 890b5530b..d82c08152 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Inference-only Jamba model.""" from typing import Iterable, List, Optional, Set, Tuple diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e7c264c04..d91c8782a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 296af2aac..de3777cad 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index fda4f22d3..185edcb8d 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 5be85d7c0..a50025135 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5b0f35b08..ac502000c 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 553bc9c28..5034b3345 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA model.""" from typing import Iterable, 
List, Optional, Set, Tuple diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 79393421f..353177f78 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 66bdcb89a..a19d7da56 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Set, Tuple import torch diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 6254d26c7..29473f5bb 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 5e1e6c6fa..878f0c895 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2024 The ModelBest team. diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index eb4282d62..f1c168076 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index bf967d33a..6964d6bdc 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index fbb3704fa..70880eb75 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 7a9b8cd88..fdc438917 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index f7f9d7a18..d1cb04cdb 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index f1d796ca2..cf4123a2c 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import Iterable, List, Set, Tuple diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index a9102a607..23814e632 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 5c7ae0dee..b524a1497 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import re from array import array diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 123581641..676c96062 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math from typing import Iterable, List, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 2340283b6..6f0b831ac 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index df4fd0a32..2aa04bd71 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py # -------------------------------------------------------- # NVLM-D diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 538e31ec9..3b470dfdd 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index a35c911f9..4b0455098 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index fbe5d1aee..d6e24c6d6 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index ea1185aa8..ad1d66902 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index a3757b5c8..f4f5cdff6 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py # Copyright (c) OrionStar Inc. diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 5a28b1ffb..65d810dc2 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 14dd4b5b1..6a80bea34 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 59b7508a3..6b05bfee9 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 34141511e..8f84e0726 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index f47676b93..a8b7e9b2a 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import Iterable, List, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 0fcda81da..f089fa5d2 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 6367b770a..aa4bb52c4 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 37b9989e4..003e9c84c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from dataclasses import dataclass, fields from functools import cached_property diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 86a9d3089..d7f6662bc 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py # Copyright (c) Alibaba Cloud. diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 82de1c357..e3de6b64f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index fc5aed5c9..cf104ab00 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 95de6c218..35d9854a5 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 593ce4857..00e4159e2 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a2778ee73..189ac41e8 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index de05bf2b7..40bbc7d16 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Whenever you add an architecture to this page, please also update `tests/models/registry.py` with example HuggingFace models for it. 
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 5997a7689..742e63a06 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 1e5101897..a81462f6f 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Implementation of SiglipVisionModel intended to be only used within a vision language model.""" diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index e6d919f23..6215ed814 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index c9d1af782..a5d443266 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. # diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1cd0dedfe..01ea43666 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 02ca7fe08..a38035e37 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 605a0ecf4..5e86b15db 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" import math diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 01a232fdc..fff4be34d 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from dataclasses import dataclass, field from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 57166f05c..0d67ee7bb 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Final, Generic, Optional, Protocol, TypeVar, Union diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 15e35fa9c..2319c3160 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import (Iterable, 
List, Mapping, Optional, Set, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index a9ce8af15..2b1294bf7 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from fractions import Fraction from typing import Callable, Optional, Union diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index b86cafce8..dea8b0e9d 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Any, Dict, List, Tuple diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 61e8881b6..0a580a4e9 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from dataclasses import dataclass from typing import Dict, List, Optional, Tuple diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 6f1cc9d5e..04f922dfd 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utils for model executor.""" from typing import Any, Dict, Optional diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 1d7f5d57f..741bd1a6a 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .base import MultiModalPlaceholderMap, MultiModalPlugin from .hasher import MultiModalHashDict, MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index de80f22ba..f379ec168 100644 --- a/vllm/multimodal/audio.py +++ 
b/vllm/multimodal/audio.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 from io import BytesIO from pathlib import Path diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fd3ec7e0e..c48d07ba3 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from collections import defaultdict from pathlib import Path diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 24aa1ca65..7d277fd67 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle from typing import TYPE_CHECKING, Iterable, Mapping, Optional diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index da13a381c..98ac8057e 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 from functools import lru_cache from io import BytesIO diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index b35184f68..eb52551bb 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index ccff0e857..063f458b2 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from collections import UserDict from collections.abc import Callable, Iterator, Mapping, Sequence diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 750646ac6..2ad42d1c1 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import re from 
abc import ABC, abstractmethod from collections import defaultdict diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index c68edaff8..953c01000 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 7a4b85385..29036691b 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools from collections import UserDict from dataclasses import dataclass diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 900bed592..583f53655 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from itertools import groupby from pathlib import Path diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 1ad1f5abc..88f184399 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 from functools import lru_cache, partial from io import BytesIO diff --git a/vllm/outputs.py b/vllm/outputs.py index 25b226528..786380c37 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from dataclasses import dataclass from typing import Dict, Generic, List, MutableSequence, Optional diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index ddbdc43ca..d34b660df 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import traceback from itertools import chain diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 159ea94f9..4e0683b8a 
100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import TYPE_CHECKING, Optional diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 91dcdff00..44d2506f0 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Code inside this file can safely assume cuda platform, e.g. importing pynvml. However, it should not initialize cuda context. """ diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 0e1c4c0c5..78ddb67bb 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import TYPE_CHECKING, Optional diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 186fa54bf..dc6545c93 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import platform import random diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 23a7126fb..5a03f5f7a 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional from vllm.logger import init_logger diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 3282c0617..41221de0a 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional import torch diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 888852163..cd851c0d8 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from functools import lru_cache from typing import TYPE_CHECKING, Dict, List, Optional diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 
494a17633..fffc61bba 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional import torch diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 039cdd5ad..81bc85f94 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional import torch diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index a78a05491..389cb8728 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import os from typing import Callable, Dict diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index b24b7e91a..061232eb1 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Optional import msgspec diff --git a/vllm/profiler/__init__.py b/vllm/profiler/__init__.py index 3e25f5cc2..00af72b1d 100644 --- a/vllm/profiler/__init__.py +++ b/vllm/profiler/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .layerwise_profile import layerwise_profile __all__ = [ diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 29c0edd0e..6351ef63d 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy from collections import defaultdict from dataclasses import asdict, dataclass, field diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py index 033035e43..62b39f510 100644 --- a/vllm/profiler/utils.py +++ b/vllm/profiler/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Callable, Dict, List, Type, Union diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py index 
27a61e692..c2f9f1691 100644 --- a/vllm/prompt_adapter/layers.py +++ b/vllm/prompt_adapter/layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Optional diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index 18a5f86c3..3ba7d0896 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import math from typing import Any, Callable, Dict, List, Optional, Type diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index 775dd11db..dfb8e61d7 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import msgspec from vllm.adapter_commons.request import AdapterRequest diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 8b2732923..dd179ab93 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420 import os diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py index ddc1ef893..28dcc1687 100644 --- a/vllm/prompt_adapter/worker_manager.py +++ b/vllm/prompt_adapter/worker_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging from typing import Any, Optional, Set, Type diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 605c09b8d..97f9e2129 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Sampling parameters for text generation.""" import copy from dataclasses import dataclass diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 20063a5b4..9f6e85920 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -1,3 +1,5 
@@ +# SPDX-License-Identifier: Apache-2.0 + import functools import struct from dataclasses import dataclass diff --git a/vllm/scripts.py b/vllm/scripts.py index 8101e6b3a..467cab28f 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # The CLI entrypoint to vLLM. import argparse import os diff --git a/vllm/sequence.py b/vllm/sequence.py index 74320db70..534b9e606 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Sequence and its related classes.""" import copy import enum diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 56fb9ba50..e08ed742a 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from itertools import chain, count from typing import Iterator, List, Optional, Tuple diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index fe5fd39f4..3948298db 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional import torch diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index c39e98b6c..dd085ad77 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import List, Optional, Set, Union diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 21a58fc42..0b62a988e 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref from typing import List, Optional, Set, Tuple diff --git 
a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index d678f4578..bc0e0a121 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from typing import Callable, Optional, Union diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index fc41bb82e..bdaf31895 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Set, Tuple import torch diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index 3aea2eabb..6275c460e 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.sequence import (ExecuteModelRequest, SequenceData, SequenceGroupMetadata, get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 32197f8cc..5474917a6 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy import weakref from typing import Dict, List, Set, Tuple diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index e906b1789..86390c99c 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index 28a537593..2bebf80fa 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod 
from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index c6ff5e52f..a1466ba5d 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Set, Tuple import torch diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8d6d05cba..8653bece8 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy from collections import defaultdict from functools import cached_property diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index 56540744b..08e773c56 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from vllm.sequence import SequenceGroupMetadata diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 6bf7587cd..b538923c0 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Set, Tuple import torch diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index c88820ab2..9c04680a6 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from contextlib import contextmanager from typing import Dict, List, Optional, Sequence, Tuple diff --git a/vllm/tracing.py b/vllm/tracing.py index 72a3f8511..bf069ad84 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Mapping, Optional 
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index eeec029fc..01d5bb4b5 100644 --- a/vllm/transformers_utils/__init__.py +++ b/vllm/transformers_utils/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.envs import VLLM_USE_MODELSCOPE if VLLM_USE_MODELSCOPE: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 5805f4ad0..1c0f20a6e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import json import os diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f065c5612..c484a755a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 7780bf5e7..6625ccf0f 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index e563bf626..43e9503ff 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/THUDM/ChatGLM2-6B from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py index 1509330fc..e30409b3a 100644 --- 
a/vllm/transformers_utils/configs/cohere2.py +++ b/vllm/transformers_utils/configs/cohere2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa # Adapted from diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py index 0dc966472..8f40b2b7d 100644 --- a/vllm/transformers_utils/configs/dbrx.py +++ b/vllm/transformers_utils/configs/dbrx.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py index 681528c3c..24d4052d8 100644 --- a/vllm/transformers_utils/configs/deepseek_vl2.py +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 from typing import Tuple diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index b357a785e..b26aba666 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index f60a59f55..39364367e 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. 
diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py index c82cc6065..f161a06f3 100644 --- a/vllm/transformers_utils/configs/falcon.py +++ b/vllm/transformers_utils/configs/falcon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py # Copyright 2023 The vLLM team. diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py index b94c5b77e..48b5d79ff 100644 --- a/vllm/transformers_utils/configs/h2ovl.py +++ b/vllm/transformers_utils/configs/h2ovl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py # -------------------------------------------------------- diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py index ac2492317..8ea62546e 100644 --- a/vllm/transformers_utils/configs/internvl.py +++ b/vllm/transformers_utils/configs/internvl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py # -------------------------------------------------------- diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index 82f129eb2..0cab2c42e 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright 2023 Cerebras Systems. 
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py index d71a08343..885713c5d 100644 --- a/vllm/transformers_utils/configs/medusa.py +++ b/vllm/transformers_utils/configs/medusa.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py index 49e766d7f..eb77e09ad 100644 --- a/vllm/transformers_utils/configs/mllama.py +++ b/vllm/transformers_utils/configs/mllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from transformers.models.mllama import configuration_mllama as mllama_hf_config diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py index 946af4e91..c761f659e 100644 --- a/vllm/transformers_utils/configs/mlp_speculator.py +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 0f047c8b0..96356135f 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 1edf36329..fdf4fa2a5 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
# diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py index 8007176ae..300f6e211 100644 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ b/vllm/transformers_utils/configs/nvlm_d.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py # -------------------------------------------------------- diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py index 0e6d8e487..c6e446333 100644 --- a/vllm/transformers_utils/configs/olmo2.py +++ b/vllm/transformers_utils/configs/olmo2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index 0c1c048f6..0d5db896b 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py index eb6f5a059..5da6c5b44 100644 --- a/vllm/transformers_utils/configs/telechat2.py +++ b/vllm/transformers_utils/configs/telechat2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py """ Telechat configuration compatible with LlamaConfig. 
""" diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index f724bf7f2..99715ba6d 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py from typing import Any, Dict, Optional diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 7c8423d2b..9d1d4bb92 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Optional from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams, diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 37ff8a236..8160a35ff 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple from .tokenizer import AnyTokenizer diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index b12cc83a2..3197b07d8 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from typing import Any, cast diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 9c71b8cad..4696f0c49 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor) diff --git 
a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index 27cdf6bc2..d37381ea9 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 74a56cbf5..4fe744d28 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import fnmatch import os import shutil diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 1f1d67fab..520870b56 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import os import warnings diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 09569c564..c223768b1 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Type from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index e6cc7cd4e..fbdfa3e57 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import List, Optional diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 
3f7627e11..30cab752c 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing import List, Optional diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 6dc2f9056..025971cb7 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from vllm.config import TokenizerPoolConfig diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index e68ad79b2..2b64f3fc7 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .mistral import MistralTokenizer, maybe_serialize_tool_calls __all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index d801cf4e4..cecafcc78 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import re from dataclasses import dataclass diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 10a09fb4f..71fe3ef0b 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from os import PathLike from pathlib import Path from typing import Union diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 568185383..c8f7a32ce 100644 --- a/vllm/triton_utils/__init__.py +++ 
b/vllm/triton_utils/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.triton_utils.importing import HAS_TRITON __all__ = ["HAS_TRITON"] diff --git a/vllm/triton_utils/custom_cache_manager.py b/vllm/triton_utils/custom_cache_manager.py index 17039d7ba..4163969c9 100644 --- a/vllm/triton_utils/custom_cache_manager.py +++ b/vllm/triton_utils/custom_cache_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from triton.runtime.cache import (FileCacheManager, default_cache_dir, diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 0c96e0632..a20700248 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from importlib.util import find_spec from vllm.logger import init_logger diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 7f5cc9063..fbbb21c89 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import datetime import json import logging diff --git a/vllm/utils.py b/vllm/utils.py index 15481fb06..3089f0951 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import asyncio import concurrent diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ce83b1fac..837d7faf4 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer with FlashAttention.""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 9d570b334..651bc01aa 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + from typing import TYPE_CHECKING, Dict, List, Set, Tuple from vllm.logger import init_logger diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7176ec954..94086e4a1 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 2b6557ad3..c801ab9e4 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """KV-Cache Utilities.""" from collections.abc import Sequence from dataclasses import dataclass diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 27c9ac1ae..f4738bb33 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import deque from dataclasses import dataclass from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index abe4952c4..912b92862 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional, Union diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b9dc3561d..3c4e35e4a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing import AsyncGenerator, List, Mapping, Optional, Type, Union diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f50303bda..29a9ac186 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + import pickle import queue import signal diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f3b992d68..247380ef7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import signal diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 4a8b61bee..6d800f026 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import List, Optional, Union diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 55d314ebe..e0452bcad 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Mapping, Optional, Type, Union from typing_extensions import TypeVar diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py index d83460a40..83a0d9db1 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_mapper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional from vllm.config import ModelConfig diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 234ef8194..aeefd5239 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from dataclasses import dataclass from typing import Dict, List, Optional diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6196c1105..366287951 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from typing import Mapping, Optional, Union diff --git 
a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 131be7598..ac10d43eb 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Type from vllm.config import VllmConfig diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index f6cf35da0..e3f07172d 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pickle import signal diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 6d5cc32ff..eddfb5949 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f901822c7..f736e38f1 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from abc import ABC, abstractmethod from typing import List diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 527750512..88f2c0835 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from dataclasses import dataclass from typing import TYPE_CHECKING, List diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 32aee44e3..6e82bffd7 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 80160c673..0519d9e78 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from typing import 
TYPE_CHECKING, List, Optional, Union diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index d60f7eb5d..8e54de345 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional, Set diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index 2796d0494..ba368b44a 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Set, Tuple import torch diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index f2007d85c..27431001e 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict import torch diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 9ad665a64..3da7498e0 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A layer that samples the next tokens from the model's outputs.""" from typing import Tuple diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index b1cd5c118..1791dfa2b 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 902800e05..09d382638 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from dataclasses import dataclass from dataclasses import field as dataclass_field diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 8dfcf2dd7..5494542c1 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import 
multiprocessing import os import weakref diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 26a2084b1..8d0785243 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import numpy as np diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 28d8e3905..39708f833 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Datastructures defining an input batch from dataclasses import dataclass diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a00c00c30..0b5644525 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gc import time from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a8cf0aec3..0adb69073 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A GPU worker class.""" import gc import os diff --git a/vllm/version.py b/vllm/version.py index 66e189dce..70cd0289b 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + try: from ._version import __version__, __version_tuple__ except Exception as e: diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index c427b759b..252fe0660 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """CacheEngine class for managing the KV cache.""" from typing import List diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index fa6775cbd..71e32c5f7 100644 --- 
a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 4b429b67b..1c3feece9 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import weakref from collections import defaultdict diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index d31ba89e1..c0744d63b 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3e5fcf11b..27b1a2dd1 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A CPU worker class.""" from typing import Dict, List, Optional, Set, Tuple, Type diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 8a161b740..e2d338f75 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import itertools from typing import Any, Dict, List, Optional, Tuple, Type, cast diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index a339c97a8..b846d4387 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index aaf9cb40b..a1f31bead 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 322d91d62..90f08b1df 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import gc import inspect diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index aef4bdcdd..9e33ef9f1 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import pickle from abc import ABC, abstractmethod diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 4aab09c80..90771e8ac 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import functools from dataclasses import dataclass, field diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py index e654f7172..387119998 100644 --- a/vllm/worker/multi_step_tpu_worker.py +++ b/vllm/worker/multi_step_tpu_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Dict, Optional, Tuple diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 1f982fe10..3518ab2f6 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py 
@@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from dataclasses import dataclass from typing import Dict, List, Optional, Tuple diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 596c26eac..f2093fc42 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from dataclasses import dataclass from importlib.util import find_spec diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index e02c72faa..5f0eb0019 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A Neuron worker class.""" from typing import List, Optional, Tuple diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 42fe2cf66..44442cddb 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict from typing import Dict, List, NamedTuple, Optional, Tuple diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index f5b46cde3..0690222d9 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """An OpenVINO worker class.""" from typing import Any, Dict, List, Optional, Tuple diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 6de227f3c..f43085b0e 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 874951828..ecdf7aa88 100644 --- a/vllm/worker/tpu_model_runner.py +++ 
b/vllm/worker/tpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import time from dataclasses import dataclass diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index ea0e70054..12f10169f 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import List, Optional, Tuple, Union diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index ffa8c4cb0..d925f0883 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 ''' Worker-related helper functions. ''' diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 1d2884d3d..582aa460e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A GPU worker class.""" import gc import os diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 6eeb4aa17..819b81fbf 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import os import time diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index b7b7b7227..9c726e1a1 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import time import weakref diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index e9cb623c8..047c0bbbc 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A XPU worker class.""" import gc import os -- GitLab From e64330910b4e503e1a672a73c2bfbfc9b7305b86 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 3 Feb 2025 09:32:18 +0800 Subject: [PATCH 09/65] [doc][misc] clarify VLLM_HOST_IP for multi-node inference (#12667) As more and more people are 
trying deepseek models with multi-node inference, https://github.com/vllm-project/vllm/issues/7815 becomes more frequent. Let's give clear message to users. Signed-off-by: youkaichao --- docs/source/serving/distributed_serving.md | 12 +++++++++--- vllm/executor/ray_utils.py | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 3f9ca27eb..6d136147c 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -60,7 +60,8 @@ bash run_cluster.sh \ vllm/vllm-openai \ ip_of_head_node \ --head \ - /path/to/the/huggingface/home/in/this/node + /path/to/the/huggingface/home/in/this/node \ + -e VLLM_HOST_IP=ip_of_this_node ``` On the rest of the worker nodes, run the following command: @@ -70,10 +71,11 @@ bash run_cluster.sh \ vllm/vllm-openai \ ip_of_head_node \ --worker \ - /path/to/the/huggingface/home/in/this/node + /path/to/the/huggingface/home/in/this/node \ + -e VLLM_HOST_IP=ip_of_this_node ``` -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. +Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. 
Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses. Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. @@ -103,3 +105,7 @@ Please make sure you downloaded the model to all the nodes (with the same path), When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. ::: + +:::{warning} +If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` to see the IP address used by Ray. See for more information. +::: diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 5d5cc8398..7b3015597 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -214,7 +214,10 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): logger.info( "Waiting for creating a placement group of specs for " "%d seconds. specs=%s. 
Check " - "`ray status` to see if you have enough resources.", + "`ray status` to see if you have enough resources," + " and make sure the IP addresses used by ray cluster" + " are the same as VLLM_HOST_IP environment variable" + " specified in each node if you are running on a multi-node.", int(time.time() - s), placement_group_specs) try: -- GitLab From 326fcc8b9f1d75a9f194581c98a7994d220c5255 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 2 Feb 2025 19:19:56 -0800 Subject: [PATCH 10/65] [Doc] Deprecate Discord (#12668) --- CODE_OF_CONDUCT.md | 2 +- README.md | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f801b5f8f..1a9596841 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -61,7 +61,7 @@ representative at an online or offline/IRL event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement in the #code-of-conduct -channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +channel in the [vLLM Slack](https://slack.vllm.ai). All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the diff --git a/README.md b/README.md index 80c3ba7d1..09c2c6d35 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone

-| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack | +| Documentation | Blog | Paper | Twitter/X | Developer Slack |

--- @@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us * For technical questions and feature requests, please use Github issues or discussions. -* For discussing with fellow users, please use Discord. -* For coordinating contributions and development, please use Slack. +* For discussing with fellow users and coordinating contributions and development, please use Slack. * For security disclosures, please use Github's security advisory feature. * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. -- GitLab From 95460fc51318702a33226b87152afe810187e01e Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Sun, 2 Feb 2025 21:09:50 -0800 Subject: [PATCH 11/65] [Kernel] port sgl moe_align_block_size kernels (#12574) sgl_moe_align_block_size is based on: https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a moe_align_block_size is based on: https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0 Signed-off-by: Yang Chen --- csrc/moe/moe_align_sum_kernels.cu | 92 ++++++++++ csrc/moe/moe_ops.h | 6 + csrc/moe/torch_bindings.cpp | 9 + vllm/_custom_ops.py | 9 + vllm/envs.py | 9 +- .../layers/fused_moe/fused_moe.py | 162 +++++++++++++++++- 6 files changed, 284 insertions(+), 3 deletions(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 8b6fe72ad..ff74a42d7 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -197,6 +197,72 @@ __global__ void moe_align_block_size_global_mem_kernel( } } +// taken from +// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a +template +__global__ void sgl_moe_align_block_size_kernel( + scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, + int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, + int32_t block_size, size_t numel, int32_t* cumsum) { + 
__shared__ int32_t shared_counts[32][8]; + __shared__ int32_t local_offsets[256]; + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + const int experts_per_warp = 8; + const int my_expert_start = warp_id * experts_per_warp; + + for (int i = 0; i < experts_per_warp; ++i) { + if (my_expert_start + i < num_experts) { + shared_counts[warp_id][i] = 0; + } + } + + const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); + const size_t start_idx = threadIdx.x * tokens_per_thread; + + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int expert_id = topk_ids[i]; + int warp_idx = expert_id / experts_per_warp; + int expert_offset = expert_id % experts_per_warp; + atomicAdd(&shared_counts[warp_idx][expert_offset], 1); + } + + __syncthreads(); + + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + int expert_count = 0; + int warp_idx = (i - 1) / experts_per_warp; + int expert_offset = (i - 1) % experts_per_warp; + expert_count = shared_counts[warp_idx][expert_offset]; + + cumsum[i] = + cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size; + } + *total_tokens_post_pad = cumsum[num_experts]; + } + + __syncthreads(); + + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + local_offsets[threadIdx.x] = cumsum[threadIdx.x]; + } + + __syncthreads(); + + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int32_t expert_id = topk_ids[i]; + int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1); + sorted_token_ids[rank_post_pad] = i; + } +} + template __global__ void moe_sum_kernel( scalar_t* __restrict__ out, // [..., d] @@ -305,6 +371,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, } } +void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t 
block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad) { + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { + // calc needed amount of shared mem for `tokens_cnts` and `cumsum` + // tensors + auto options_int = + torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); + // torch::Tensor token_cnts_buffer = + // torch::empty({(num_experts + 1) * num_experts}, options_int); + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); + + auto kernel = vllm::moe::sgl_moe_align_block_size_kernel; + kernel<<<1, 1024, 0, stream>>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel(), cumsum_buffer.data_ptr()); + }); +} + void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] torch::Tensor& output) // [num_tokens, hidden_size] { diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 596cc0aa6..66bb5f41b 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -12,3 +12,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad); + +void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index f3a558c14..8540633dc 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -22,6 +22,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " Tensor! 
num_tokens_post_pad) -> ()"); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + // temporarily adapted from + // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a + m.def( + "sgl_moe_align_block_size(Tensor topk_ids, int num_experts," + " int block_size, Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! num_tokens_post_pad) -> ()"); + m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size); + #ifndef USE_ROCM m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index ce4f75341..bdc9a6a33 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -952,6 +952,15 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, num_tokens_post_pad) +def sgl_moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, + block_size: int, sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor) -> None: + torch.ops._moe_C.sgl_moe_align_block_size(topk_ids, num_experts, + block_size, sorted_token_ids, + experts_ids, num_tokens_post_pad) + + def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, token_expert_indicies: torch.Tensor, gating_output: float) -> None: diff --git a/vllm/envs.py b/vllm/envs.py index 78ee3047b..5018f6deb 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -82,6 +82,7 @@ if TYPE_CHECKING: VLLM_MLA_DISABLE: bool = False VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True VLLM_MLA_DISABLE_REQUANTIZATION: bool = False + VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False def get_default_cache_root(): @@ -531,7 +532,13 @@ environment_variables: Dict[str, Callable[[], Any]] = { # matrices to match the activation type. This can lead to higher memory and # compute usage but better preserves the accuracy of the original model. 
"VLLM_MLA_DISABLE_REQUANTIZATION": - lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))) + lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))), + + # If set, vLLM will use the Triton implementation of moe_align_block_size, + # i.e. moe_align_block_size_triton in fused_moe.py. + "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": + lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) + ), } # end-env-vars-definition diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 9613696a0..1bed35525 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -405,6 +405,144 @@ def fused_moe_kernel( tl.store(c_ptrs, accumulator, mask=c_mask) +def ceil_div(a, b): + return (a + b - 1) // b + + +@triton.jit +def moe_align_block_size_stage1( + topk_ids_ptr, + tokens_cnts_ptr, + num_experts: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + + start_idx = pid * tokens_per_thread + + off_c = (pid + 1) * num_experts + + for i in range(tokens_per_thread): + if start_idx + i < numel: + idx = tl.load(topk_ids_ptr + start_idx + i) + token_cnt = tl.load(tokens_cnts_ptr + off_c + idx) + tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1) + + +@triton.jit +def moe_align_block_size_stage2( + tokens_cnts_ptr, + num_experts: tl.constexpr, +): + pid = tl.program_id(0) + + last_cnt = 0 + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid) + last_cnt = last_cnt + token_cnt + tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt) + + +@triton.jit +def moe_align_block_size_stage3( + total_tokens_post_pad_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, +): + last_cumsum = 0 + off_cnt = num_experts * num_experts + for i in range(1, num_experts + 1): + token_cnt = 
tl.load(tokens_cnts_ptr + off_cnt + i - 1) + last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size + tl.store(cumsum_ptr + i, last_cumsum) + tl.store(total_tokens_post_pad_ptr, last_cumsum) + + +@triton.jit +def moe_align_block_size_stage4( + topk_ids_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + start_idx = tl.load(cumsum_ptr + pid) + end_idx = tl.load(cumsum_ptr + pid + 1) + + for i in range(start_idx, end_idx, block_size): + tl.store(expert_ids_ptr + i // block_size, pid) + + start_idx = pid * tokens_per_thread + off_t = pid * num_experts + + for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread, + numel)): + expert_id = tl.load(topk_ids_ptr + i) + token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id) + rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id) + tl.store(sorted_token_ids_ptr + rank_post_pad, i) + tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1) + + +# Triton implementation based on: +# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0 +def moe_align_block_size_triton( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + numel = topk_ids.numel() + grid = (num_experts, ) + tokens_cnts = torch.zeros((num_experts + 1, num_experts), + dtype=torch.int32, + device=topk_ids.device) + cumsum = torch.zeros((num_experts + 1, ), + dtype=torch.int32, + device=topk_ids.device) + tokens_per_thread = ceil_div(numel, num_experts) + + moe_align_block_size_stage1[grid]( + topk_ids, + tokens_cnts, + num_experts, + numel, + tokens_per_thread, + ) + moe_align_block_size_stage2[grid]( + tokens_cnts, + num_experts, + ) + moe_align_block_size_stage3[(1, )]( + num_tokens_post_pad, + 
tokens_cnts, + cumsum, + num_experts, + block_size, + ) + moe_align_block_size_stage4[grid]( + topk_ids, + sorted_token_ids, + expert_ids, + tokens_cnts, + cumsum, + num_experts, + block_size, + numel, + tokens_per_thread, + ) + + def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -457,8 +595,28 @@ def moe_align_block_size( num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, - expert_ids, num_tokens_post_pad) + if num_experts >= 224: + if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: + moe_align_block_size_triton( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) + else: + ops.sgl_moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) + else: + ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, + expert_ids, num_tokens_post_pad) return sorted_ids, expert_ids, num_tokens_post_pad -- GitLab From 20579c0fae1757c9da9fc35a69960563186f3036 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 3 Feb 2025 13:40:25 +0800 Subject: [PATCH 12/65] make sure mistral_common not imported for non-mistral models (#12669) When people use deepseek models, they find that they need to solve cv2 version conflict, see https://zhuanlan.zhihu.com/p/21064432691 . I added the check, and make all imports of `cv2` lazy. 
--------- Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 4 +-- ...{lazy_torch_compile.py => lazy_imports.py} | 19 ++++++++--- vllm/multimodal/video.py | 3 +- vllm/transformers_utils/tokenizers/mistral.py | 34 ++++++++++++------- 4 files changed, 40 insertions(+), 20 deletions(-) rename tests/standalone_tests/{lazy_torch_compile.py => lazy_imports.py} (56%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d5d02fdeb..05f3c3b31 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -50,9 +50,9 @@ steps: - tests/multimodal - tests/test_utils - tests/worker - - tests/standalone_tests/lazy_torch_compile.py + - tests/standalone_tests/lazy_imports.py commands: - - python3 standalone_tests/lazy_torch_compile.py + - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py diff --git a/tests/standalone_tests/lazy_torch_compile.py b/tests/standalone_tests/lazy_imports.py similarity index 56% rename from tests/standalone_tests/lazy_torch_compile.py rename to tests/standalone_tests/lazy_imports.py index b3b580952..61e3b3879 100644 --- a/tests/standalone_tests/lazy_torch_compile.py +++ b/tests/standalone_tests/lazy_imports.py @@ -8,7 +8,17 @@ from contextlib import nullcontext from vllm_test_utils import BlameResult, blame -module_name = "torch._inductor.async_compile" +# List of modules that should not be imported too early. +# Lazy import `torch._inductor.async_compile` to avoid creating +# too many processes before we set the number of compiler threads. +# Lazy import `cv2` to avoid bothering users who only use text models. +# `cv2` can easily mess up the environment. 
+module_names = ["torch._inductor.async_compile", "cv2"] + + +def any_module_imported(): + return any(module_name in sys.modules for module_name in module_names) + # In CI, we only check finally if the module is imported. # If it is indeed imported, we can rerun the test with `use_blame=True`, @@ -16,8 +26,7 @@ module_name = "torch._inductor.async_compile" # and help find the root cause. # We don't run it in CI by default because it is slow. use_blame = False -context = blame( - lambda: module_name in sys.modules) if use_blame else nullcontext() +context = blame(any_module_imported) if use_blame else nullcontext() with context as result: import vllm # noqa @@ -25,6 +34,6 @@ if use_blame: assert isinstance(result, BlameResult) print(f"the first import location is:\n{result.trace_stack}") -assert module_name not in sys.modules, ( - f"Module {module_name} is imported. To see the first" +assert not any_module_imported(), ( + f"Some the modules in {module_names} are imported. To see the first" f" import location, run the test with `use_blame=True`.") diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 88f184399..78a2918e3 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -6,7 +6,6 @@ from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional -import cv2 import numpy as np import numpy.typing as npt from PIL import Image @@ -95,6 +94,8 @@ def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: new_height, new_width = size resized_frames = np.empty((num_frames, new_height, new_width, channels), dtype=frames.dtype) + # lazy import cv2 to avoid bothering users who only use text models + import cv2 for i, frame in enumerate(frames): resized_frame = cv2.resize(frame, (new_width, new_height)) resized_frames[i] = resized_frame diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index cecafcc78..1550f978e 100644 --- 
a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -8,21 +8,18 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast import huggingface_hub from huggingface_hub import HfApi, hf_hub_download -from mistral_common.protocol.instruct.request import ChatCompletionRequest -from mistral_common.tokens.tokenizers.base import SpecialTokens -# yapf: disable -from mistral_common.tokens.tokenizers.mistral import ( - MistralTokenizer as PublicMistralTokenizer) -# yapf: enable -from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer) -from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy, - Tekkenizer) from vllm.logger import init_logger from vllm.utils import is_list_of if TYPE_CHECKING: + # make sure `mistral_common` is lazy imported, + # so that users who only use non-mistral models + # will not be bothered by the dependency. + from mistral_common.protocol.instruct.request import ChatCompletionRequest + from mistral_common.tokens.tokenizers.mistral import ( + MistralTokenizer as PublicMistralTokenizer) + from vllm.entrypoints.chat_utils import ChatCompletionMessageParam logger = init_logger(__name__) @@ -33,7 +30,7 @@ class Encoding: input_ids: Union[List[int], List[List[int]]] -def maybe_serialize_tool_calls(request: ChatCompletionRequest): +def maybe_serialize_tool_calls(request: "ChatCompletionRequest"): # SEE: https://github.com/vllm-project/vllm/pull/9951 # Credits go to: @gcalmettes # NOTE: There is currently a bug in pydantic where attributes @@ -108,12 +105,16 @@ def find_tokenizer_file(files: List[str]): class MistralTokenizer: - def __init__(self, tokenizer: PublicMistralTokenizer) -> None: + def __init__(self, tokenizer: "PublicMistralTokenizer") -> None: self.mistral = tokenizer self.instruct = tokenizer.instruct_tokenizer tokenizer_ = tokenizer.instruct_tokenizer.tokenizer + from mistral_common.tokens.tokenizers.tekken import ( + 
SpecialTokenPolicy, Tekkenizer) self.is_tekken = isinstance(tokenizer_, Tekkenizer) + from mistral_common.tokens.tokenizers.sentencepiece import ( + SentencePieceTokenizer) self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer) if self.is_tekken: # Make sure special tokens will not raise @@ -153,6 +154,8 @@ class MistralTokenizer: assert Path( path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}" + from mistral_common.tokens.tokenizers.mistral import ( + MistralTokenizer as PublicMistralTokenizer) mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file) return cls(mistral_tokenizer) @@ -181,6 +184,8 @@ class MistralTokenizer: # by the guided structured output backends. @property def all_special_tokens_extended(self) -> List[str]: + from mistral_common.tokens.tokenizers.base import SpecialTokens + # tekken defines its own extended special tokens list if hasattr(self.tokenizer, "SPECIAL_TOKENS"): special_tokens = self.tokenizer.SPECIAL_TOKENS @@ -284,6 +289,8 @@ class MistralTokenizer: if last_message["role"] == "assistant": last_message["prefix"] = True + from mistral_common.protocol.instruct.request import ( + ChatCompletionRequest) request = ChatCompletionRequest(messages=messages, tools=tools) # type: ignore[type-var] encoded = self.mistral.encode_chat_completion(request) @@ -292,6 +299,7 @@ class MistralTokenizer: return encoded.tokens def convert_tokens_to_string(self, tokens: List[str]) -> str: + from mistral_common.tokens.tokenizers.base import SpecialTokens if self.is_tekken: tokens = [ t for t in tokens @@ -363,6 +371,8 @@ class MistralTokenizer: ids: List[int], skip_special_tokens: bool = True, ) -> List[str]: + from mistral_common.tokens.tokenizers.base import SpecialTokens + # TODO(Patrick) - potentially allow special tokens to not be skipped assert ( skip_special_tokens -- GitLab From c5932e5daceba8a5eefece98f1a769ddf84a864b Mon Sep 17 00:00:00 2001 From: Eldar Kurtic Date: Mon, 3 Feb 2025 06:42:18 +0100 Subject: [PATCH 13/65] 
Properly check if all fused layers are in the list of targets (#12666) Thanks @kylesayrs for catching this! --- .../layers/quantization/compressed_tensors/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index d700a0b15..4ea79531e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -236,7 +236,7 @@ def _match_fused_layer(layer_name: str, for type_suffix in possible_layer_types) if is_same_parent and is_matching_type and all( - '.'.join([parent_path, type_suffix]) + (f"{parent_path}.{type_suffix}" in target_layers) for type_suffix in possible_layer_types): return target -- GitLab From b9986454fe8ba80e2a109d069397b6b59aae658b Mon Sep 17 00:00:00 2001 From: Srikanth Srinivas Date: Sun, 2 Feb 2025 21:46:19 -0800 Subject: [PATCH 14/65] Fix for attention layers to remain unquantized during moe_wn16 quant (#12570) Fix to AWQ quant loading of the new R1 model The new optimized MoE kernels for a large number of experts `moe_wn16` uses AWQ quant which requires the attention layers to be in 16bit The current merge has broken this, and the `get_quant_method` must return None for it to work correctly again --------- Signed-off-by: Srikanth Srinivas Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Beim Signed-off-by: rshaw@neuralmagic.com Signed-off-by: mgoin Signed-off-by: npanpaliya Signed-off-by: Aleksandr Malyshev Signed-off-by: Lucas Wilkinson Signed-off-by: simon-mo Signed-off-by: Cody Yu Signed-off-by: Chen Zhang Signed-off-by: Tyler Michael Smith Signed-off-by: Ryan N Signed-off-by: Brian Dellabetta Signed-off-by: Jee Jee Li Signed-off-by: Rahul Tuli Signed-off-by: Russell Bryant Signed-off-by: simon-mo Signed-off-by: Vicente Herrera Signed-off-by: Jinzhen Lin 
Signed-off-by: Woosuk Kwon Signed-off-by: Shawn Du Signed-off-by: Kunshang Ji Signed-off-by: youkaichao Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Beim <805908499@qq.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: mgoin Co-authored-by: simon-mo Co-authored-by: Nishidha Co-authored-by: Lucas Wilkinson Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Co-authored-by: Aleksandr Malyshev Co-authored-by: Woosuk Kwon Co-authored-by: simon-mo Co-authored-by: Michael Goin Co-authored-by: Zhuohan Li Co-authored-by: Tyler Michael Smith Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Cody Yu Co-authored-by: Chen Zhang Co-authored-by: Kevin H. Luu Co-authored-by: Tyler Michael Smith Co-authored-by: Ryan Nguyen <96593302+xpbowler@users.noreply.github.com> Co-authored-by: Brian Dellabetta Co-authored-by: fade_away <1028552010@qq.com> Co-authored-by: weilong.yu Co-authored-by: Jee Jee Li Co-authored-by: Eldar Kurtic Co-authored-by: Rahul Tuli Co-authored-by: Russell Bryant Co-authored-by: Vicente Herrera Co-authored-by: Jinzhen Lin Co-authored-by: Shawn Du Co-authored-by: Kunshang Ji Co-authored-by: youkaichao --- vllm/model_executor/layers/quantization/moe_wna16.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 1ae765a22..56fa597e2 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -7,7 +7,8 @@ import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) -from 
vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.quantization.base_config import ( @@ -125,9 +126,7 @@ class MoeWNA16Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: if is_layer_skipped_quant(prefix, self.modules_to_not_convert): return UnquantizedLinearMethod() - elif isinstance(layer, FusedMoE): - return MoeWNA16Method(self) - else: + elif isinstance(layer, LinearBase): if self.linear_quant_method == "gptq": if self.use_marlin: return GPTQMarlinConfig.from_config( @@ -144,6 +143,9 @@ class MoeWNA16Config(QuantizationConfig): self.full_config).get_quant_method(layer, prefix) else: raise ValueError("moe_wna16 only support gptq and awq.") + elif isinstance(layer, FusedMoE): + return MoeWNA16Method(self) + return None def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]): -- GitLab From ad4a9dc817f00c266d1ca210342d1865aa69db27 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 3 Feb 2025 15:58:21 +0800 Subject: [PATCH 15/65] [cuda] manually import the correct pynvml module (#12679) fixes problems like https://github.com/vllm-project/vllm/pull/12635 and https://github.com/vllm-project/vllm/pull/12636 and https://github.com/vllm-project/vllm/pull/12565 --------- Signed-off-by: youkaichao --- vllm/platforms/__init__.py | 3 ++- vllm/platforms/cuda.py | 10 ++------ vllm/utils.py | 52 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index d34b660df..9c98942b5 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -33,7 +33,8 @@ def cuda_platform_plugin() -> Optional[str]: is_cuda = False try: - import pynvml + from 
vllm.utils import import_pynvml + pynvml = import_pynvml() pynvml.nvmlInit() try: if pynvml.nvmlDeviceGetCount() > 0: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 44d2506f0..b49852a72 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -8,7 +8,6 @@ from functools import lru_cache, wraps from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar, Union) -import pynvml import torch from typing_extensions import ParamSpec @@ -16,6 +15,7 @@ from typing_extensions import ParamSpec import vllm._C # noqa import vllm.envs as envs from vllm.logger import init_logger +from vllm.utils import import_pynvml from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -29,13 +29,7 @@ logger = init_logger(__name__) _P = ParamSpec("_P") _R = TypeVar("_R") -if pynvml.__file__.endswith("__init__.py"): - logger.warning( - "You are using a deprecated `pynvml` package. Please install" - " `nvidia-ml-py` instead, and make sure to uninstall `pynvml`." - " When both of them are installed, `pynvml` will take precedence" - " and cause errors. See https://pypi.org/project/pynvml " - "for more information.") +pynvml = import_pynvml() # pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models # see https://github.com/huggingface/diffusers/issues/9704 for details diff --git a/vllm/utils.py b/vllm/utils.py index 3089f0951..a2b53fcf2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2208,3 +2208,55 @@ def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any], else: func = partial(method, obj) # type: ignore return func(*args, **kwargs) + + +def import_pynvml(): + """ + Historical comments: + + libnvml.so is the library behind nvidia-smi, and + pynvml is a Python wrapper around it. We use it to get GPU + status without initializing CUDA context in the current process. 
+ Historically, there are two packages that provide pynvml: + - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): The official + wrapper. It is a dependency of vLLM, and is installed when users + install vLLM. It provides a Python module named `pynvml`. + - `pynvml` (https://pypi.org/project/pynvml/): An unofficial wrapper. + Prior to version 12.0, it also provides a Python module `pynvml`, + and therefore conflicts with the official one. What's worse, + the module is a Python package, and has higher priority than + the official one which is a standalone Python file. + This causes errors when both of them are installed. + Starting from version 12.0, it migrates to a new module + named `pynvml_utils` to avoid the conflict. + + TL;DR: if users have pynvml<12.0 installed, it will cause problems. + Otherwise, `import pynvml` will import the correct module. + We take the safest approach here, to manually import the correct + `pynvml.py` module from the `nvidia-ml-py` package. + """ + if TYPE_CHECKING: + import pynvml + return pynvml + if "pynvml" in sys.modules: + import pynvml + if pynvml.__file__.endswith("__init__.py"): + # this is pynvml < 12.0 + raise RuntimeError( + "You are using a deprecated `pynvml` package. " + "Please uninstall `pynvml` or upgrade to at least" + " version 12.0. 
See https://pypi.org/project/pynvml " + "for more information.") + return sys.modules["pynvml"] + import importlib.util + import os + import site + for site_dir in site.getsitepackages(): + pynvml_path = os.path.join(site_dir, "pynvml.py") + if os.path.exists(pynvml_path): + spec = importlib.util.spec_from_file_location( + "pynvml", pynvml_path) + pynvml = importlib.util.module_from_spec(spec) + sys.modules["pynvml"] = pynvml + spec.loader.exec_module(pynvml) + return pynvml -- GitLab From 1298a400e8e8496b6e9ce3847ada5ec9f6e6cb48 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 3 Feb 2025 15:59:49 +0800 Subject: [PATCH 16/65] [ci/build] fix gh200 test (#12681) Signed-off-by: youkaichao --- .buildkite/check-wheel-size.py | 4 ++-- .buildkite/run-gh200-test.sh | 4 ++-- Dockerfile | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 2e4aecdd3..a378bc6ba 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -4,11 +4,11 @@ import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB # Note that we have 400 MiB quota, please use it wisely. # See https://github.com/pypi/support/issues/3792 . # Please also sync the value with the one in Dockerfile. 
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300)) +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 3e4e40946..99972afa2 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -23,6 +23,6 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and test offline inference -docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/basic.py +docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' + python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B ' diff --git a/Dockerfile b/Dockerfile index 0b9f74e08..7ecb643f4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py -ARG VLLM_MAX_SIZE_MB=300 +ARG VLLM_MAX_SIZE_MB=400 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ -- GitLab From a1a2aaadb9122f05667140e39cf67e5736c8b6d6 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Mon, 3 Feb 2025 14:30:38 +0100 Subject: [PATCH 17/65] [Model]: Add `transformers` backend support (#11330) # Adds support for `transformers` as a backend Following https://github.com/huggingface/transformers/pull/35235, a bunch of models should already be supported, we are ramping up support for more models. Thanks @Isotr0py for the TP support, and @hmellor for his help as well! 
This includes: - `trust_remote_code=True` support: any model on the hub, if it implements attention the correct way can be natively supported!! - tensor parallel support --------- Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung Co-authored-by: Michael Goin Co-authored-by: Isotr0py --- .buildkite/test-pipeline.yaml | 2 + docs/source/models/supported_models.md | 76 ++++++ requirements-common.txt | 2 +- tests/models/registry.py | 5 + tests/models/test_oot_registration.py | 4 +- tests/models/test_transformers.py | 75 ++++++ vllm/config.py | 14 ++ vllm/engine/arg_utils.py | 22 +- vllm/model_executor/model_loader/utils.py | 61 ++++- vllm/model_executor/models/registry.py | 12 +- vllm/model_executor/models/transformers.py | 264 +++++++++++++++++++++ 11 files changed, 528 insertions(+), 9 deletions(-) create mode 100644 tests/models/test_transformers.py create mode 100644 vllm/model_executor/models/transformers.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 05f3c3b31..a847a68a6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -349,6 +349,7 @@ steps: - vllm/ - tests/models commands: + - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py @@ -485,6 +486,7 @@ steps: - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/language/test_bart.py -v 
-s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index afaad8818..4a0996469 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -40,6 +40,82 @@ If vLLM successfully returns text (for generative models) or hidden states (for Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. +### Transformers fallback + +After the merge of [huggingface/transformers#35235](https://github.com/huggingface/transformers/pull/35235), `vllm` can fall back to models that are available in `transformers`. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned! + +To check if the backend is `transformers`, you can simply do this: + +```python +from vllm import LLM +llm = LLM(model=..., task="generate") # Name or path of your model +llm.apply_model(lambda model: print(model.__class__)) +``` + +If it is `TransformersModel` then it means it's based on `transformers`! + +#### Supported features + +##### LoRA and quantization + +Neither is supported yet! Make sure to open an issue and we'll work on this together with the `transformers` team! + +Usually `transformers` models load weights via the `load_adapter` API, which depends on PEFT. We need to work a bit to either use this API (for now this would result in some weights not being marked as loaded) or replace modules accordingly. + +Hints as to what this would look like: + +```python +class TransformersModel(nn.Module, SupportsLoRA): + def __init__(*): + ...
+ self.model.load_adapter(vllm_config.load_config.model_loader_extra_config["qlora_adapter_name_or_path"]) +``` + +The blocker is that you need to specify the supported LoRA layers, whereas we would ideally want to load whatever is inside the checkpoint! + +##### Remote code + +This fallback also means that any model on the hub that can be used in `transformers` with `trust_remote_code=True` that correctly implements attention can be used in production! + +```python +from vllm import LLM +llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model +llm.apply_model(lambda model: print(model.__class__)) +``` + +A model just needs the following two things: + +```python +from transformers import PreTrainedModel +from torch import nn + +class MyAttention(nn.Module): + + def forward(self, hidden_states, **kwargs): # <- kwargs are required + + ... + attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + **kwargs, + ) + ... + +class MyModel(PreTrainedModel): + _supports_attention_backend = True +``` + +Here is what happens in the background: + +1. The config is loaded. +2. The `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`. +3. The `TransformersModel` backend is used. See `vllm/model_executor/models/transformers.py`, which leverages `self.config._attn_implementation = "vllm"`, hence the need to use `ALL_ATTENTION_FUNCTIONS`. + +That's it! + ### ModelScope To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: diff --git a/requirements-common.txt b/requirements-common.txt index e5248572c..97e33a6db 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -5,7 +5,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.48.2 # Required for Bamba.
+transformers >= 4.48.2 # Required for Bamba model and Transformers backend. tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' diff --git a/tests/models/registry.py b/tests/models/registry.py index d0dbbf00e..8a0ade4fa 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -281,12 +281,17 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 } +_FALLBACK_MODEL = { + "TransformersModel": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 +} + _EXAMPLE_MODELS = { **_TEXT_GENERATION_EXAMPLE_MODELS, **_EMBEDDING_EXAMPLE_MODELS, **_CROSS_ENCODER_EXAMPLE_MODELS, **_MULTIMODAL_EXAMPLE_MODELS, **_SPECULATIVE_DECODING_EXAMPLE_MODELS, + **_FALLBACK_MODEL, } diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index ef665baa1..f2a505596 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -15,7 +15,9 @@ def test_plugin(dummy_opt_path): os.environ["VLLM_PLUGINS"] = "" with pytest.raises(Exception) as excinfo: LLM(model=dummy_opt_path, load_format="dummy") - assert "are not supported for now" in str(excinfo.value) + error_msg = "has no vLLM implementation and " \ + "the Transformers implementation is not compatible with vLLM." + assert (error_msg in str(excinfo.value)) @fork_new_process_for_each_test diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py new file mode 100644 index 000000000..c6536f37c --- /dev/null +++ b/tests/models/test_transformers.py @@ -0,0 +1,75 @@ +"""Test the functionality of the Transformers backend. + +Run `pytest tests/models/test_transformers.py`. 
+""" +from contextlib import nullcontext +from typing import Type + +import pytest + +from ..conftest import HfRunner, VllmRunner +from ..utils import multi_gpu_test +from .utils import check_logprobs_close + + +def check_implementation( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + example_prompts: list[str], + model: str, + **kwargs, +): + max_tokens = 32 + num_logprobs = 5 + + with vllm_runner(model, **kwargs) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with hf_runner(model) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize( + "model,model_impl", + [ + ("meta-llama/Llama-3.2-1B-Instruct", "transformers"), + ("openai-community/gpt2", "transformers"), + ("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE + ("meta-llama/Llama-3.2-1B-Instruct", "auto"), + ]) # trust_remote_code=True by default +def test_models(hf_runner, vllm_runner, example_prompts, model, + model_impl) -> None: + + maybe_raises = nullcontext() + if model == "openai-community/gpt2" and model_impl == "transformers": + # Model is not backend compatible + maybe_raises = pytest.raises( + ValueError, + match="The Transformers implementation.*not compatible with vLLM") + + with maybe_raises: + check_implementation(hf_runner, + vllm_runner, + example_prompts, + model, + model_impl=model_impl) + + +@multi_gpu_test(num_gpus=2) +def test_distributed( + hf_runner, + vllm_runner, + example_prompts, +): + kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2} + check_implementation(hf_runner, vllm_runner, example_prompts, + "meta-llama/Llama-3.2-1B-Instruct", **kwargs) diff --git a/vllm/config.py b/vllm/config.py index d2d59c705..d70a63795 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -83,6 +83,12 
@@ class SupportsHash(Protocol): ... +class ModelImpl(str, enum.Enum): + AUTO = "auto" + VLLM = "vllm" + TRANSFORMERS = "transformers" + + class ModelConfig: """Configuration for the model. @@ -167,6 +173,12 @@ class ModelConfig: `logits_processors` extra completion argument. Defaults to None, which allows no processors. generation_config: Configuration parameter file for generation. + model_impl: Which implementation of the model to use: + "auto" will try to use the vLLM implementation if it exists and + fall back to the Transformers implementation if no vLLM + implementation is available. + "vllm" will use the vLLM model implementation. + "transformers" will use the Transformers model implementation. override_generation_config: Override the generation config with the given config. """ @@ -230,6 +242,7 @@ class ModelConfig: generation_config: Optional[str] = None, enable_sleep_mode: bool = False, override_generation_config: Optional[Dict[str, Any]] = None, + model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ) -> None: self.model = model self.tokenizer = tokenizer @@ -241,6 +254,7 @@ class ModelConfig: self.code_revision = code_revision self.rope_scaling = rope_scaling self.rope_theta = rope_theta + self.model_impl = model_impl if hf_overrides is None: hf_overrides = {} diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7c0e8c214..40c6fb456 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -13,10 +13,10 @@ import vllm.envs as envs from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, DecodingConfig, DeviceConfig, HfOverrides, KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PoolerConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TaskOption, TokenizerPoolConfig, - VllmConfig) + ModelConfig, ModelImpl, ObservabilityConfig, + ParallelConfig, PoolerConfig, PromptAdapterConfig, + SchedulerConfig, SpeculativeConfig, TaskOption, + 
TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -199,6 +199,7 @@ class EngineArgs: generation_config: Optional[str] = None override_generation_config: Optional[Dict[str, Any]] = None enable_sleep_mode: bool = False + model_impl: str = "auto" calculate_kv_scales: Optional[bool] = None @@ -378,6 +379,18 @@ class EngineArgs: 'qualified names that can be passed with the `logits_processors` ' 'extra completion argument. Defaults to None, which allows no ' 'processors.') + parser.add_argument( + '--model-impl', + type=str, + default=EngineArgs.model_impl, + choices=[f.value for f in ModelImpl], + help='Which implementation of the model to use.\n\n' + '* "auto" will try to use the vLLM implementation if it exists ' + 'and fall back to the Transformers implementation if no vLLM ' + 'implementation is available.\n' + '* "vllm" will use the vLLM model implementation.\n' + '* "transformers" will use the Transformers model ' + 'implementation.\n') # Parallel arguments parser.add_argument( '--distributed-executor-backend', @@ -1017,6 +1030,7 @@ class EngineArgs: generation_config=self.generation_config, override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, + model_impl=self.model_impl, ) def create_load_config(self) -> LoadConfig: diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 084ca53b1..eb334c1fd 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -2,17 +2,22 @@ """Utilities for selecting and loading models.""" import contextlib from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import torch +import transformers from torch import nn +from transformers.dynamic_module_utils import 
get_class_from_dynamic_module -from vllm.config import ModelConfig +from vllm.config import ModelConfig, ModelImpl +from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models.adapters import (as_classification_model, as_embedding_model, as_reward_model) +logger = init_logger(__name__) + @contextlib.contextmanager def set_default_torch_dtype(dtype: torch.dtype): @@ -23,6 +28,50 @@ def set_default_torch_dtype(dtype: torch.dtype): torch.set_default_dtype(old_dtype) +def is_transformers_impl_compatible( + arch: str, + module: Optional[transformers.PreTrainedModel] = None) -> bool: + mod = module or getattr(transformers, arch, None) + if mod is None: + return False + if hasattr(mod, "supports_backend"): + return mod.is_backend_compatible() + else: + return mod._supports_flex_attn + + +def resolve_transformers_fallback(model_config: ModelConfig, + architectures: list[str]): + for i, arch in enumerate(architectures): + if arch == "TransformersModel": + continue + custom_module = None + auto_map = getattr(model_config.hf_config, "auto_map", None) + if auto_map is not None and "AutoModel" in auto_map: + custom_module = get_class_from_dynamic_module( + model_config.hf_config.auto_map["AutoModel"], + model_config.model) + # TODO(Isotr0py): Further clean up these raises. + # perhaps handled them in _ModelRegistry._raise_for_unsupported? 
+ if model_config.model_impl == ModelImpl.TRANSFORMERS: + if not is_transformers_impl_compatible(arch, custom_module): + raise ValueError( + f"The Transformers implementation of {arch} is not " + "compatible with vLLM.") + architectures[i] = "TransformersModel" + if model_config.model_impl == ModelImpl.AUTO: + if not is_transformers_impl_compatible(arch, custom_module): + raise ValueError( + f"{arch} has no vLLM implementation and the Transformers " + "implementation is not compatible with vLLM.") + logger.warning( + "%s has no vLLM implementation, falling back to Transformers " + "implementation. Some features may not be supported and " + "performance may not be optimal.", arch) + architectures[i] = "TransformersModel" + return architectures + + def get_model_architecture( model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) @@ -38,6 +87,14 @@ def get_model_architecture( and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] + vllm_supported_archs = ModelRegistry.get_supported_archs() + is_vllm_supported = any(arch in vllm_supported_archs + for arch in architectures) + if (not is_vllm_supported + or model_config.model_impl == ModelImpl.TRANSFORMERS): + architectures = resolve_transformers_fallback(model_config, + architectures) + model_cls, arch = ModelRegistry.resolve_model_cls(architectures) if model_config.task == "embed": model_cls = as_embedding_model(model_cls) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 40bbc7d16..962f95f10 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -184,6 +184,10 @@ _SPECULATIVE_DECODING_MODELS = { "MedusaModel": ("medusa", "Medusa"), "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } + +_FALLBACK_MODEL = { + "TransformersModel": ("transformers", "TransformersModel"), +} # yapf: enable _VLLM_MODELS = { 
@@ -192,6 +196,7 @@ _VLLM_MODELS = { **_CROSS_ENCODER_MODELS, **_MULTIMODAL_MODELS, **_SPECULATIVE_DECODING_MODELS, + **_FALLBACK_MODEL, } @@ -378,7 +383,12 @@ class _ModelRegistry: if not architectures: logger.warning("No model architectures are specified") - return architectures + normalized_arch = [] + for model in architectures: + if model not in self.models: + model = "TransformersModel" + normalized_arch.append(model) + return normalized_arch def inspect_model_cls( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py new file mode 100644 index 000000000..ff1ae0ac8 --- /dev/null +++ b/vllm/model_executor/models/transformers.py @@ -0,0 +1,264 @@ +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Wrapper around `transformers` models""" +import re +from typing import Iterable, List, Optional, Set, Tuple, Union + +import torch +from torch import nn +from transformers import AutoModel, PreTrainedModel +from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.utils import divide +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .utils import maybe_prefix + +logger = init_logger(__name__) + + +def vllm_flash_attention_forward( + # Transformers args + module: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: torch.Tensor, + # Transformers kwargs + scaling: float = None, + # vLLM kwargs + attn_metadata: AttentionMetadata = None, + attention_instances: list[Attention] = None, + **kwargs): + self_attn = attention_instances[module.layer_idx] + if scaling is not None: + self_attn.impl.scale = float(scaling) + hidden = query.shape[-2] + query, key, value = (x.transpose(1, 2) for x in (query, key, value)) + query, key, value = (x.reshape(hidden, -1) for x in (query, key, value)) + return self_attn.forward( + query, + key, + value, + kv_cache=None, # argument not used + attn_metadata=attn_metadata), None + + +ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward + + +# Linear Layer that is 
compatible with transformers internal forward +# TODO: This is a temporary solution, we should find a better way to integrate +class HFColumnParallelLinear(ColumnParallelLinear): + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input)[0] + + +class HFRowParallelLinear(RowParallelLinear): + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input)[0] + + +def replace_tp_linear_class(orig_module: nn.Linear, + style: str, + quant_config=None): + """ + In model configurations, we use a neutral type (string) to specify parallel + styles, here we use it to translate nn.Linear into vllm-style tp Linear. + + Quant config is not supported yet + """ + + if not isinstance(style, str): + raise ValueError( + f"Unsupported parallel style type {type(style)}, expected str") + + input_size = orig_module.in_features + output_size = orig_module.out_features + bias = orig_module.bias is not None + + if style == "colwise": + return HFColumnParallelLinear( + input_size, + output_size, + bias, + ) + elif style == "rowwise": + return HFRowParallelLinear( + input_size, + output_size, + bias, + ) + # We don't consider colwise_rep since it's used in lm_head + else: + raise ValueError(f"Unsupported parallel style value: {style}") + + +class TransformersModel(nn.Module): + embedding_padding_modules = ["lm_head"] + embedding_modules = ["embed_tokens" + ] # TODO transformers will have a util to get it + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + logger.info("Using Transformers backend.") + + self.vllm_config = vllm_config + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.quant_config = quant_config + self.config = config + self.vocab_size = config.vocab_size + self.unpadded_vocab_size = config.vocab_size + + self.model: PreTrainedModel = AutoModel.from_config( + self.config, + 
attn_implementation="vllm", + torch_dtype=vllm_config.model_config.dtype, + trust_remote_code=vllm_config.model_config.trust_remote_code, + ) + prefix = self.model.base_model_prefix + + # MLP modifications + self.tensor_parallelize(self.model) + + # Attention modifications (assumes 1 attention op per hidden layer) + tp_size = get_tensor_model_parallel_world_size() + self.attention_instances = [ + Attention( + num_heads=divide(config.num_attention_heads, tp_size), + head_size=config.head_dim, + # NOTE: We use Llama scale as default, if it's set by + # Transformers, it's updated in vllm_flash_attention_forward + scale=config.head_dim**-0.5, + num_kv_heads=divide(config.num_key_value_heads, tp_size), + cache_config=cache_config, + quant_config=None, + prefix=f"{i}.attn") for i in range(config.num_hidden_layers) + ] + + # Model modifications + self.replace_vocab_embed_class(self.model) + + # ForCausalLM modifications + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=maybe_prefix(prefix, "lm_head")) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.get_input_embeddings().weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, logit_scale) + self.sampler = get_sampler() + + def log_replacement(self, name: str, old_module: nn.Module, + new_module: nn.Module): + logger.debug("%s: %s -> %s", name, old_module, new_module) + + def tensor_parallelize(self, module: nn.Module, prefix: str = ""): + if (self.config.base_model_tp_plan is None + and self.vllm_config.parallel_config.tensor_parallel_size > 1): + raise ValueError( + "Trying to run tensor parallelization but the model does not " + "support it yet!") + + for child_name, child_module in module.named_children(): + qual_name = prefix + child_name + for pattern, style in self.config.base_model_tp_plan.items(): + if re.match(pattern, qual_name) and isinstance( + 
child_module, nn.Linear): + new_module = replace_tp_linear_class( + child_module, style, self.quant_config) + setattr(module, child_name, new_module) + self.log_replacement(qual_name, child_module, new_module) + else: + self.tensor_parallelize(child_module, prefix=f"{qual_name}.") + + def replace_vocab_embed_class(self, module: nn.Module): + # Use native set input embeddings + new_module = VocabParallelEmbedding( + self.vocab_size, + self.config.hidden_size, + org_num_embeddings=self.config.vocab_size, + quant_config=None, + ) + self.log_replacement("input embedding", + self.model.get_input_embeddings(), new_module) + self.model.set_input_embeddings(new_module) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], # argument not used + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model( + input_ids[None, ...], + use_cache=False, + position_ids=positions[None, ...], + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + attention_instances=self.attention_instances, + return_dict=False)[0][0, ...] 
# we remove batch dimension for now + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample(self, logits: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: + + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if name not in params_dict: + name = f"{self.model.base_model_prefix}.{name}" + if name in params_dict: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params -- GitLab From 33e0602e59cfb37ab0bfdff5ea6802aeb3a3ecc9 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 3 Feb 2025 14:16:59 -0500 Subject: [PATCH 18/65] [Misc] Fix improper placement of SPDX header in scripts (#12694) Signed-off-by: Russell Bryant --- cmake/hipify.py | 3 +-- tests/models/test_transformers.py | 1 + tools/check_spdx_header.py | 17 ++++++++++++----- tools/report_build_time_ninja.py | 2 +- vllm/attention/ops/triton_flash_attention.py | 3 +-- vllm/model_executor/models/transformers.py | 1 + 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/cmake/hipify.py b/cmake/hipify.py index 2e0c8a172..a15577125 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,6 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 - #!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index c6536f37c..1d5d9729d 100644 --- 
a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test the functionality of the Transformers backend. Run `pytest tests/models/test_transformers.py`. diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py index 3f7fd66bf..709befc53 100644 --- a/tools/check_spdx_header.py +++ b/tools/check_spdx_header.py @@ -10,18 +10,25 @@ def check_spdx_header(file_path): with open(file_path, encoding='UTF-8') as file: lines = file.readlines() if not lines: - # not necessary for an empty file like __init__.py + # Empty file like __init__.py return True - if not lines[0].strip().startswith(SPDX_HEADER_PREFIX): - return False - return True + for line in lines: + if line.strip().startswith(SPDX_HEADER_PREFIX): + return True + return False def add_header(file_path): with open(file_path, 'r+', encoding='UTF-8') as file: lines = file.readlines() file.seek(0, 0) - file.write(SPDX_HEADER + '\n\n' + ''.join(lines)) + if lines and lines[0].startswith("#!"): + file.write(lines[0]) + file.write(SPDX_HEADER + '\n') + file.writelines(lines[1:]) + else: + file.write(SPDX_HEADER + '\n') + file.writelines(lines) def main(): diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 33e85b9ff..011af2522 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -1,6 +1,6 @@ +#!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 -#!/usr/bin/env python3 # Copyright (c) 2018 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index ab8fb8953..745818eb6 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -1,6 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 - #!/usr/bin/env python +# SPDX-License-Identifier: Apache-2.0 """ Fused Attention =============== diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index ff1ae0ac8..160beaa14 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 # Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); -- GitLab From c11de33dad217bca79225128059b6fac7e1b2519 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Mon, 3 Feb 2025 16:04:59 -0500 Subject: [PATCH 19/65] [Bugfix][Kernel] Fix per-token/per-channel quantization for Hopper scaled mm (#12696) Signed-off-by: Tyler Michael Smith --- .../cutlass_w8a8/scaled_mm_c3x.cu | 59 ++++++++----------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 72d549e59..e40f28229 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -16,29 +16,11 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - using GroupShape = std::array; - int M = a.size(0), N = b.size(1), K = a.size(1); - GroupShape a_scale_group_shape = [&, &s = a_scales]() -> GroupShape { - if (s.numel() == 1) return {M, K}; // tensor-wise - if (s.dim() == 2) - return {ceil_div(a.size(0), s.size(0)), ceil_div(a.size(1), s.size(1))}; - TORCH_CHECK(false, "Unsupported scale shape for scale_a"); - }(); - 
- GroupShape b_scale_group_shape = [&, &s = b_scales]() -> GroupShape { - if (s.numel() == 1) return {K, N}; // tensor-wise - if (s.dim() == 2) - return {ceil_div(b.size(0), s.size(0)), ceil_div(b.size(1), s.size(1))}; - TORCH_CHECK(false, "Unsupported scale shape for scale_b"); - }(); - - if ((a_scale_group_shape == GroupShape{M, K} || - a_scale_group_shape == GroupShape{1, K}) && - (b_scale_group_shape == GroupShape{K, N} || - b_scale_group_shape == GroupShape{K, 1})) { - // "standard per-tensor/per-token/per-channel" scaling + if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) && + (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) { + // Standard per-tensor/per-token/per-channel scaling TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); if (a.dtype() == torch::kFloat8_e4m3fn) { vllm::cutlass_scaled_mm_sm90_fp8(c, a, b, a_scales, b_scales, bias); @@ -46,25 +28,32 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, TORCH_CHECK(a.dtype() == torch::kInt8); vllm::cutlass_scaled_mm_sm90_int8(c, a, b, a_scales, b_scales, bias); } - } else if (a_scale_group_shape == GroupShape{1, 128} && - b_scale_group_shape == GroupShape{128, 128}) { + } else { + using GroupShape = std::array; + auto make_group_shape = [](torch::Tensor const& x, + torch::Tensor const& s) -> GroupShape { + TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D"); + return {ceil_div(x.size(0), s.size(0)), ceil_div(x.size(1), s.size(1))}; + }; + + GroupShape a_scale_group_shape = make_group_shape(a, a_scales); + GroupShape b_scale_group_shape = make_group_shape(b, b_scales); + // 1x128 per-token group scales for activations // 128x128 blockwise scales for weights - TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn && - b.dtype() == torch::kFloat8_e4m3fn, - "Currently only FP8 is supported for A group shape 1x128 and " - "B group shape 128x128"); - TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); - - 
vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales); - } else { - TORCH_CHECK(false, - "Unsupported scale group shapes for CUTLASS 3.x GEMM.\n " - "a_scale_group_shape must be [1, 128], got: [", + TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} && + b_scale_group_shape == GroupShape{128, 128} && + a.dtype() == torch::kFloat8_e4m3fn && + b.dtype() == torch::kFloat8_e4m3fn), + "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n" + "a_scale_group_shape must be [1, 128]. Got: [", a_scale_group_shape[0], ", ", a_scale_group_shape[1], "]\n" - "b_scale_group_shape must be [128, 128], got: [", + "b_scale_group_shape must be [128, 128]. Got: [", b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]"); + TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); + + vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales); } } -- GitLab From 6dd5e52823cc0ca8ddc9c4377d29ead37cc09a95 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 3 Feb 2025 16:29:56 -0500 Subject: [PATCH 20/65] Squelch MLA warning for Compressed-Tensors Models (#12704) Signed-off-by: Kyle Sayers --- vllm/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d70a63795..2f4a7ad76 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -986,6 +986,9 @@ class ModelConfig: @property def use_mla(self) -> bool: + if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE: + return False + if self.quantization is not None and self.quantization not in [\ "fp8", "compressed-tensors"]: logger.warning( @@ -1012,8 +1015,7 @@ class ModelConfig: quant_config) return False - use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE) - return use_mla + return True @property def supported_runner_types(self) -> Set[RunnerType]: -- GitLab From 4797dad3ec48c2b1fa8a1e4cc53c7854675b6b8d Mon Sep 17 00:00:00 2001 From: kushanam <42385577+kushanam@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:30:39 -0800 
Subject: [PATCH 21/65] [Model] Add Deepseek V3 fp8_w8a8 configs for B200 (#12707) --- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ 13 files changed, 1898 insertions(+) create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..77ba0d747 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..0a5d7bfdb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + 
"num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..cb91a279d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..7febe3d27 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + 
}, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..9d7658bfc --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..03dba5ad1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..9a5ff48b8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..386928de1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, 
+ "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..9c908e804 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..f78e7060e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + 
}, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..1d3ce5c94 --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..3ab5796ee --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000..3cb7eaa07 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} -- GitLab From cf58b9c4cab7a90e56a45d30edfc43912b9a0a56 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 3 Feb 2025 13:34:16 -0800 Subject: [PATCH 22/65] [MISC] Remove model input dumping when exception (#12582) Signed-off-by: Cody Yu --- .github/ISSUE_TEMPLATE/400-bug-report.yml | 9 --- .../test_basic_correctness.py | 58 ------------------ vllm/worker/model_runner.py | 3 +- vllm/worker/model_runner_base.py | 61 +------------------ 4 files changed, 3 insertions(+), 128 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 30db1721a..d4113da8b 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -30,15 +30,6 @@ body: validations: required: true -- type: textarea - attributes: - label: Model Input Dumps - description: | - If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. 
If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process. - placeholder: | - Upload the dumped input file. - validations: - required: false - type: textarea attributes: label: 🐛 Describe the bug diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 2792dfde7..f001a8935 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -4,16 +4,12 @@ Run `pytest tests/basic_correctness/test_basic_correctness.py`. """ import os -import pickle -import re import weakref -from unittest.mock import patch import pytest from vllm import LLM from vllm.platforms import current_platform -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from ..conftest import VllmRunner from ..models.utils import check_outputs_equal @@ -151,57 +147,3 @@ def test_models_distributed( name_0="hf", name_1="vllm", ) - - -@pytest.mark.skip_v1 -def test_model_with_failure(vllm_runner) -> None: - try: - with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", - side_effect=ValueError()): - with pytest.raises(ValueError) as exc_info: - vllm_runner("facebook/opt-125m", - dtype="half", - enforce_eager=False, - gpu_memory_utilization=0.7) - matches = re.search(r"input dumped to (.+).pkl", - str(exc_info.value)) - assert matches is not None - filename = f"{matches.group(1)}.pkl" - - with open(filename, "rb") as filep: - inputs = pickle.load(filep) - - if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")): - raise AssertionError("Missing keys in dumped inputs. 
Dumped keys: " - f"{list(inputs.keys())}") - assert isinstance(inputs["arg_1"], - ModelInputForGPUWithSamplingMetadata) - finally: - os.remove(filename) - - -@pytest.mark.skip_v1 -def test_failure_with_async_out_proc(vllm_runner) -> None: - - filename = None - try: - with vllm_runner("facebook/opt-125m", - dtype="half", - enforce_eager=False, - gpu_memory_utilization=0.7) as vllm_model,\ - patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", - side_effect=ValueError()): - model_config = vllm_model.model.llm_engine.model_config - assert model_config.use_async_output_proc - with pytest.raises(ValueError) as exc_info: - vllm_model.generate_greedy('how to make pizza?', 250) - matches = re.search(r"input dumped to (.+).pkl", - str(exc_info.value)) - assert matches is not None - - filename = f"{matches.group(1)}.pkl" - finally: - # Clean up - if filename is not None: - os.remove(filename) - pass diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 90f08b1df..0bbba55b3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -57,7 +57,7 @@ from vllm.worker.model_runner_base import ( _add_attn_metadata_broadcastable_dict, _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) + _init_sampling_metadata_from_tensor_dict) if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1647,7 +1647,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): virtual_engine=virtual_engine) @torch.inference_mode() - @dump_input_when_exception(exclude_args=[0], exclude_kwargs=["self"]) def execute_model( self, model_input: ModelInputForGPUWithSamplingMetadata, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 9e33ef9f1..38d2b712e 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -1,16 +1,12 @@ # 
SPDX-License-Identifier: Apache-2.0 import dataclasses -import pickle from abc import ABC, abstractmethod -from datetime import datetime -from functools import wraps -from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, - Optional, Type, TypeVar) +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, + TypeVar) import torch import torch.nn as nn -from torch import is_tensor from vllm.config import VllmConfig from vllm.logger import init_logger @@ -107,59 +103,6 @@ def _init_frozen_model_input_from_tensor_dict( return tensor_dict -def dump_input_when_exception(exclude_args: Optional[List[int]] = None, - exclude_kwargs: Optional[List[str]] = None): - - def _inner(func): - - @wraps(func) - def _wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as err: - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - filename = f"/tmp/err_{func.__name__}_input_{timestamp}.pkl" - logger.info("Writing input of failed execution to %s...", - filename) - with open(filename, "wb") as filep: - dumped_inputs = { - k: v - for k, v in kwargs.items() - if k not in (exclude_kwargs or []) - } - for i, arg in enumerate(args): - if i not in (exclude_args or []): - dumped_inputs[f"arg_{i}"] = arg - - # Only persist dtype and shape for kvcache tensors - # (can be way to big otherwise) - if (kv_caches := dumped_inputs.get("kv_caches")) \ - and isinstance(kv_caches, Iterable): - dumped_inputs["kv_caches"] = [(t.dtype, t.shape) - for t in kv_caches - if is_tensor(t)] - - try: - pickle.dump(dumped_inputs, filep) - except Exception as pickle_err: - logger.warning( - "Failed to pickle inputs of failed execution: %s", - str(pickle_err)) - raise type(err)(f"Error in model execution: " - f"{str(err)}") from err - - logger.info( - "Completed writing input of failed execution to %s.", - filename) - raise type(err)( - f"Error in model execution (input dumped to {filename}): " - f"{str(err)}") from err - - return _wrapper - - return 
_inner - - class BroadcastableModelInput(ABC): @abstractmethod -- GitLab From 5095e966069b9e65b7c4c63427e06cebacaad0a0 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 3 Feb 2025 15:04:53 -0800 Subject: [PATCH 23/65] [V1] Revert `uncache_blocks` and support recaching full blocks (#12415) Signed-off-by: Cody Yu --- tests/v1/core/test_prefix_caching.py | 30 -------------------- vllm/v1/core/kv_cache_manager.py | 41 +++++++++++----------------- 2 files changed, 16 insertions(+), 55 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 2e16d7d25..a6c0162d3 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -629,33 +629,3 @@ def test_reset_prefix_cache(): assert manager.reset_prefix_cache() assert not manager.cached_block_hash_to_block assert all([blk.block_hash is None for blk in manager.block_pool]) - - -def test_uncache_blocks(): - manager = KVCacheManager( - block_size=16, - num_gpu_blocks=10, - max_model_len=8192, - sliding_window=None, - enable_caching=True, - num_preallocate_tokens=0, - ) - - req0 = make_request("0", list(range(30))) - blocks = manager.allocate_slots(req0, 30) - assert [b.block_id for b in blocks] == [0, 1] - assert len(manager.cached_block_hash_to_block) == 1 - - req0.num_computed_tokens = 30 - - # Simulate speculative tokens. - for _ in range(5): - req0.append_output_token_ids(8) - manager.allocate_slots(req0, 5) - assert len(manager.cached_block_hash_to_block) == 2 - - # After sampling, assuming only 1 token is accepted. 
- req0.num_computed_tokens = 31 - num_uncached_blocks = manager.uncache_blocks(req0) - assert num_uncached_blocks == 1 - assert len(manager.cached_block_hash_to_block) == 1 diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 94086e4a1..de349ec12 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -252,29 +252,6 @@ class KVCacheManager: if block.ref_cnt == 0: self.free_block_queue.append(block) - def uncache_blocks(self, request: Request) -> int: - """Uncache the blocks that are no longer full based on the - num_computed_tokens in the given request. This happens when - the blocks were full and cached due to speculative tokens, but the - speculative tokens are not accepted. - - Args: - request: The request. - - Returns: - The number of uncached blocks. - """ - blocks = self.req_to_blocks[request.request_id] - num_computed_tokens = request.num_computed_tokens - num_full_blocks = num_computed_tokens // self.block_size - num_uncached_blocks = 0 - for block in blocks[num_full_blocks:]: - # If the block is not cached, the following blocks are not cached. - if not self._maybe_evict_cached_block(block): - break - num_uncached_blocks += 1 - return num_uncached_blocks - def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalid prefix caching after the weights are updated, @@ -470,8 +447,22 @@ class KVCacheManager: assert prev_block.block_hash is not None prev_block_hash_value = prev_block.block_hash.hash_value - for i, blk in enumerate(full_blocks): - blk_idx = blk_start_idx + i + # Find the first uncached block. This case should only happen when + # speculative decoding is used. + offset = 0 + for blk in full_blocks: + if blk.block_hash is None: + break + else: + prev_block_hash_value = blk.block_hash.hash_value + offset += 1 + else: + # All blocks are cached. 
+ return + + for i, blk in enumerate(full_blocks[offset:]): + blk_idx = blk_start_idx + offset + i + assert blk.block_hash is None if blk_idx < num_cached_block_hashes: # The block hash may already be computed in -- GitLab From 73b35cca7f3745d07d439c197768b25d88b6ab7f Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 3 Feb 2025 19:28:20 -0500 Subject: [PATCH 24/65] [Core] Improve hash collision avoidance in prefix caching (#12621) Signed-off-by: Russell Bryant --- tests/core/block/test_prefix_caching_block.py | 4 +- vllm/core/block/prefix_caching_block.py | 42 +++++++++++++++---- vllm/v1/core/kv_cache_utils.py | 9 ++++ 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 771627a57..bf40b334a 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -65,8 +65,8 @@ class TestPrefixCachingBlock: previous_block = MagicMock(spec=PrefixCachingBlock) prev_block_hash = random.randint(0, 1000) - previous_block.content_hash = (prev_block_hash - if prev_block_has_hash else None) + previous_block.content_hash = (prev_block_hash if prev_block_has_hash + else hash('None')) num_to_fill = block_size if is_curr_block_full else random.randint( 0, block_size - 1) diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index fbf19e1b4..1ca9e49da 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -65,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator): from 0 to num_blocks - 1. """ + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. 
This is the same as the + # behavior of None prior to Python 3.12. + _none_hash: int = hash('None') + + # Implements Block.Factory. def __init__( self, num_blocks: int, @@ -122,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator): self.metric_data = CacheMetricData() - # Implements Block.Factory. def _create_block( self, prev_block: Optional[Block], @@ -737,6 +745,14 @@ class PrefixCachingBlock(Block): such as adapters that influence the block, apart from the token_ids. """ + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. This is the same as the + # behavior of None prior to Python 3.12. + _none_hash: int = hash('None') + def __init__( self, prev_block: Optional[Block], @@ -891,13 +907,13 @@ class PrefixCachingBlock(Block): is_first_block = self._prev_block is None prev_block_hash = ( - None if is_first_block else + self._none_hash if is_first_block else self._prev_block.content_hash # type: ignore ) # Previous block exists but does not yet have a hash. # Return no hash in this case. - if prev_block_hash is None and not is_first_block: + if prev_block_hash == self._none_hash and not is_first_block: return None self._cached_content_hash = PrefixCachingBlock.hash_block_tokens( @@ -907,8 +923,9 @@ class PrefixCachingBlock(Block): extra_hash=self._extra_hash) return self._cached_content_hash - @staticmethod - def hash_block_tokens(is_first_block: bool, + @classmethod + def hash_block_tokens(cls, + is_first_block: bool, prev_block_hash: Optional[int], cur_block_token_ids: List[int], extra_hash: Optional[int] = None) -> int: @@ -929,7 +946,8 @@ class PrefixCachingBlock(Block): Returns: - int: The computed hash value for the block. 
""" - assert (prev_block_hash is None) == is_first_block + if is_first_block and prev_block_hash is None: + prev_block_hash = cls._none_hash return hash((is_first_block, prev_block_hash, *cur_block_token_ids, extra_hash)) @@ -949,6 +967,14 @@ class ComputedBlocksTracker: cached block hashes in the allocator. """ + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. This is the same as the + # behavior of None prior to Python 3.12. + _none_hash: int = hash('None') + def __init__( self, allocator: DeviceAwareBlockAllocator, @@ -994,7 +1020,7 @@ class ComputedBlocksTracker: # We need to know the hash of the previous block to compute the hash of # the current block so that blocks could be uniquely identified across # sequences of prefixes. - prev_block_hash = (None if cur_num_blocks_recorded == 0 else + prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else block_hashes_recorded[-1]) # Only update the computed block hashes for the new blocks for i in range(cur_num_blocks_recorded, num_computed_blocks): @@ -1009,7 +1035,7 @@ class ComputedBlocksTracker: # This has to be kept in sync with the allocator's hash # calculation. block_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block=prev_block_hash is None, + is_first_block=prev_block_hash == self._none_hash, prev_block_hash=prev_block_hash, cur_block_token_ids=block_token_ids, extra_hash=extra_hash, diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index c801ab9e4..e0976ba85 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -263,6 +263,15 @@ def hash_block_tokens( The hash value of the block and the token ids in the block. 
The entire tuple is used as the hash key of the block. """ + if not parent_block_hash: + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. This is the same as the + # behavior of None prior to Python 3.12. + parent_block_hash = hash('None') + curr_block_token_ids_tuple = tuple(curr_block_token_ids) return BlockHashType( hash((parent_block_hash, curr_block_token_ids_tuple, extra_keys)), -- GitLab From 5d98d56089426555d303d82bdf29b9ffc825ec20 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 3 Feb 2025 22:55:46 -0500 Subject: [PATCH 25/65] Support Pixtral-Large HF by using llava multimodal_projector_bias config (#12710) Signed-off-by: mgoin --- vllm/model_executor/models/llava.py | 6 ++++-- vllm/model_executor/models/llava_next.py | 3 ++- vllm/model_executor/models/llava_next_video.py | 9 +++++---- vllm/model_executor/models/llava_onevision.py | 4 ++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index de3777cad..19effcbfc 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -75,19 +75,20 @@ class LlavaMultiModalProjector(nn.Module): vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str, + multimodal_projector_bias: bool, quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): super().__init__() self.linear_1 = ColumnParallelLinear(vision_hidden_size, text_hidden_size, - bias=True, + bias=multimodal_projector_bias, quant_config=quant_config, prefix=f"{prefix}.linear_1") self.act = get_act_fn(projector_hidden_act) self.linear_2 = RowParallelLinear(text_hidden_size, text_hidden_size, - bias=True, + bias=multimodal_projector_bias, 
quant_config=quant_config, prefix=f"{prefix}.linear_2") @@ -503,6 +504,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias, quant_config=quant_config, prefix=maybe_prefix(prefix, "multi_modal_projector")) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 185edcb8d..defdeb54a 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -231,7 +231,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=vision_hidden_size, text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act) + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index a50025135..d70ae2f14 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -253,16 +253,16 @@ class LlavaNextVideoPooler(nn.Module): class LlavaNextMultiModalProjector(nn.Module): def __init__(self, vision_hidden_size: int, text_hidden_size: int, - projector_hidden_act: str): + projector_hidden_act: str, multimodal_projector_bias: bool): super().__init__() self.linear_1 = nn.Linear(vision_hidden_size, text_hidden_size, - bias=True) + bias=multimodal_projector_bias) self.act = get_act_fn(projector_hidden_act) self.linear_2 = nn.Linear(text_hidden_size, text_hidden_size, - bias=True) + bias=multimodal_projector_bias) def forward(self, image_features: torch.Tensor) 
-> torch.Tensor: hidden_states = self.linear_1(image_features) @@ -298,7 +298,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act) + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=config.text_config, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index ac502000c..f1c06cd85 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -372,11 +372,11 @@ class LlavaOnevisionMultiModalProjector(nn.Module): self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, - bias=True) + bias=config.multimodal_projector_bias) self.act = get_act_fn(config.projector_hidden_act) self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, - bias=True) + bias=config.multimodal_projector_bias) def forward(self, image_features: torch.Tensor) -> torch.Tensor: hidden_states = self.linear_1(image_features) -- GitLab From bb392af434a49b3f8655f0e78737ced6524056b7 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 4 Feb 2025 02:05:04 -0500 Subject: [PATCH 26/65] [Doc] Replace ibm-fms with ibm-ai-platform (#12709) Signed-off-by: Thomas Parnell --- docs/source/features/spec_decode.md | 12 ++++++------ examples/offline_inference/mlpspeculator.py | 2 +- tests/models/registry.py | 2 +- tests/spec_decode/e2e/test_mlp_correctness.py | 2 +- vllm/model_executor/models/mlp_speculator.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index 
da8712705..1e468962c 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -131,7 +131,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM( model="meta-llama/Meta-Llama-3.1-70B-Instruct", tensor_parallel_size=4, - speculative_model="ibm-fms/llama3-70b-accelerator", + speculative_model="ibm-ai-platform/llama3-70b-accelerator", speculative_draft_tensor_parallel_size=1, ) outputs = llm.generate(prompts, sampling_params) @@ -149,11 +149,11 @@ limitation will be fixed in a future release. A variety of speculative models of this type are available on HF hub: -- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator) -- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator) -- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator) -- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator) -- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator) +- [llama-13b-accelerator](https://huggingface.co/ibm-ai-platform/llama-13b-accelerator) +- [llama3-8b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-8b-accelerator) +- [codellama-34b-accelerator](https://huggingface.co/ibm-ai-platform/codellama-34b-accelerator) +- [llama2-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama2-70b-accelerator) +- [llama3-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-70b-accelerator) - [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator) - [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator) - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 10d9de8cb..f227e71ba 100644 --- 
a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -51,7 +51,7 @@ if __name__ == "__main__": # Create an LLM with spec decoding llm = LLM( model="meta-llama/Llama-2-13b-chat-hf", - speculative_model="ibm-fms/llama-13b-accelerator", + speculative_model="ibm-ai-platform/llama-13b-accelerator", ) print("With speculation") diff --git a/tests/models/registry.py b/tests/models/registry.py index 8a0ade4fa..7b5032f79 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -278,7 +278,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "MedusaModel": _HfExamplesInfo("JackFram/llama-68m", speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", - speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 + speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501 } _FALLBACK_MODEL = { diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index a2b84b902..59beca47a 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -33,7 +33,7 @@ from .conftest import run_equality_correctness_test MAIN_MODEL = "JackFram/llama-160m" # speculative model -SPEC_MODEL = "ibm-fms/llama-160m-accelerator" +SPEC_MODEL = "ibm-ai-platform/llama-160m-accelerator" # max. number of speculative tokens: this corresponds to # n_predict in the config.json of the speculator model. 
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index cf4123a2c..2920427f9 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -64,7 +64,7 @@ class MLPSpeculator(nn.Module): https://arxiv.org/pdf/2404.19124 Trained speculators of this type are available on HF hub at: - https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite + https://huggingface.co/ibm-ai-platform and https://huggingface.co/ibm-granite """ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: -- GitLab From 4896d0c2dd367efbdf8387028322bf5f74359930 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 4 Feb 2025 02:27:11 -0500 Subject: [PATCH 27/65] [Quant] Fix use_mla TypeError and support loading pure-sparsity Compressed Tensors configs (#12711) --- vllm/config.py | 5 +++-- .../quantization/compressed_tensors/compressed_tensors.py | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 2f4a7ad76..bc4bf627b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1000,8 +1000,9 @@ class ModelConfig: # have fp8 for both weights and activations. 
if self.quantization == "compressed-tensors": quant_config = self._parse_quant_hf_config() - for group_name, cfg in quant_config.get("config_groups", - ("", {})).items(): + for group_name, cfg in quant_config.get("config_groups", { + "": {} + }).items(): act_cfg = cfg.get("input_activations", {}) act_type = None if act_cfg is None else act_cfg.get("type", "") w_cfg = cfg.get("weights", {}) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 24f7542e1..1a11b2419 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -424,6 +424,11 @@ class CompressedTensorsConfig(QuantizationConfig): or input_quant is not None, weight_quant=weight_quant, input_quant=input_quant) + elif weight_quant is None: + logger.warning_once("Acceleration for non-quantized schemes is " + "not supported by Compressed Tensors. 
" + "Falling back to UnquantizedLinearMethod") + return None else: # Find the quant_scheme scheme = self._get_scheme_from_parts( # type: ignore -- GitLab From c36ac98d0118537ec5f3f405a68311a10f9b59a5 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Tue, 4 Feb 2025 03:24:11 -0500 Subject: [PATCH 28/65] [AMD][ROCm] Enable DeepSeek model on ROCm (#12662) Signed-off-by: Hongxia Yang Co-authored-by: Matthew Wong --- tests/kernels/test_rocm_attention_selector.py | 31 +++++++++++++++++++ tests/worker/test_model_runner.py | 9 ++++++ vllm/attention/backends/mla/utils.py | 6 +++- .../layers/quantization/utils/fp8_utils.py | 10 ++++++ vllm/platforms/rocm.py | 3 ++ 5 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 tests/kernels/test_rocm_attention_selector.py diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py new file mode 100644 index 000000000..5848dc014 --- /dev/null +++ b/tests/kernels/test_rocm_attention_selector.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import patch + +import pytest +import torch + +from tests.kernels.utils import override_backend_env_variable +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.rocm import RocmPlatform + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +def test_selector(monkeypatch): + """Test that the attention selector for ROCm. 
+ """ + override_backend_env_variable(monkeypatch, "ROCM_FLASH") + + with patch("vllm.attention.selector.current_platform", RocmPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert backend.get_name() == "ROCM_FLASH" + # mla test for deepseek related + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index c32ceb4fa..3f9a0d6fa 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -24,6 +24,15 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: return model_runner +def test_deepseek_mla_attn_backend_module(): + model_runner = _create_model_runner( + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", + trust_remote_code=True, + enable_chunked_prefill=False, + ) + assert model_runner.attn_backend.__name__ == "TritonMLABackend" + + @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_prompt(batch_size): model_runner = _create_model_runner( diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index 9b63192ed..8e584cca3 100644 --- a/vllm/attention/backends/mla/utils.py +++ b/vllm/attention/backends/mla/utils.py @@ -27,7 +27,11 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import ( scaled_dequantize, scaled_quantize) from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding -from vllm.vllm_flash_attn import flash_attn_varlen_func + +try: + from vllm.vllm_flash_attn import flash_attn_varlen_func +except ImportError: + from flash_attn import flash_attn_varlen_func @dataclass diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 29c7268ad..10ff71e57 100644 --- 
a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -47,6 +47,16 @@ def apply_w8a8_block_fp8_linear( shape_supported_by_cutlass = (weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0) + if current_platform.is_rocm(): + scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + + input_2d.shape[:-1])[::-1] + scale_b_shape = (weight_scale.view(-1, 1) + if weight_scale.dim() <= 1 else weight_scale.T).shape + ar, ac = scale_a_shape + br, bc = scale_b_shape + if (ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0]) + or br not in (1, weight.shape[0])): + shape_supported_by_cutlass = False if cutlass_block_fp8_supported and shape_supported_by_cutlass: q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1], diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index cd851c0d8..035766289 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -79,6 +79,9 @@ class RocmPlatform(Platform): def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla) -> str: + if use_mla: + logger.info("Using Triton MLA backend.") + return "vllm.attention.backends.triton_mla.TritonMLABackend" selected_backend = (_Backend.ROCM_FLASH if selected_backend == _Backend.FLASH_ATTN else selected_backend) if selected_backend == _Backend.ROCM_FLASH: -- GitLab From 96b23621c16d4e3b65380c6af3a7d7bac79cfa5b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 4 Feb 2025 16:27:36 +0800 Subject: [PATCH 29/65] [Misc] Add BNB quantization for Whisper (#12381) Signed-off-by: Jee Jee Li --- vllm/model_executor/model_loader/loader.py | 102 ++++++++++++--------- vllm/model_executor/model_loader/utils.py | 7 ++ vllm/model_executor/models/whisper.py | 17 +++- 3 files changed, 82 insertions(+), 44 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 809af81d7..19e3bc6a2 100644 --- 
a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -803,9 +803,11 @@ class BitsAndBytesModelLoader(BaseModelLoader): iterator = safetensors_weights_iterator(hf_weights_files) else: iterator = pt_weights_iterator(hf_weights_files) - for name, param in iterator: - # mapping weight names from transformers to vllm. - yield self.weight_mapper(name), param + for org_name, param in iterator: + # mapping weight names from transformers to vllm while preserving + # original names. + mapped_name = self.weight_mapper(org_name) + yield org_name, mapped_name, param def _get_quantized_weights_iterator( self, @@ -866,24 +868,30 @@ class BitsAndBytesModelLoader(BaseModelLoader): def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, quant_state_dict) -> Generator: - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if not weight_name.lower().endswith(".scb"): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if not mapped_weight_name.lower().endswith(".scb"): continue - weight_key = weight_name.lower().replace(".scb", ".weight") + weight_key = mapped_weight_name.lower().replace(".scb", ".weight") quant_state_dict[weight_key] = weight_tensor - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if self._is_8bit_weight_name(weight_name): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if self._is_8bit_weight_name(mapped_weight_name): continue - if weight_name in quant_state_dict: + if mapped_weight_name in quant_state_dict: set_weight_attrs(weight_tensor, {"load_in_8bit": True}) - yield weight_name, weight_tensor + yield org_weight_name, weight_tensor else: - yield weight_name, weight_tensor + yield org_weight_name, weight_tensor def _quantized_4bit_generator(self, 
hf_weights_files, use_safetensors, quant_state_dict) -> Generator: @@ -893,15 +901,19 @@ class BitsAndBytesModelLoader(BaseModelLoader): weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors) temp_state_dict = {} - for weight_name, weight_tensor in weight_iterator: - if not self._is_4bit_weight_name(weight_name): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in weight_iterator: + if not self._is_4bit_weight_name(mapped_weight_name): continue # bitsandbytes library requires # weight.quant_state.bitsandbytes__* in CPU - if "quant_state.bitsandbytes" in weight_name: - temp_state_dict[weight_name] = weight_tensor.cpu().data + if "quant_state.bitsandbytes" in mapped_weight_name: + temp_state_dict[mapped_weight_name] = weight_tensor.cpu().data else: - temp_state_dict[weight_name] = weight_tensor + temp_state_dict[mapped_weight_name] = weight_tensor # Closure to parse quant_state for each prequant weight def _parse_quant_state(param_name: str, @@ -915,20 +927,24 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Second iterate over all prequant and normal weights # pre quantized weights would have a quant_state - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if self._is_4bit_weight_name(weight_name): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if self._is_4bit_weight_name(mapped_weight_name): continue - if (f"{weight_name}.quant_state.bitsandbytes__nf4" + if (f"{mapped_weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or ( - f"{weight_name}.quant_state.bitsandbytes__fp4" + f"{mapped_weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict): - quant_state = _parse_quant_state(weight_name, temp_state_dict) - quant_state_dict[weight_name] = quant_state - yield weight_name, weight_tensor + quant_state = _parse_quant_state(mapped_weight_name, + temp_state_dict) + 
quant_state_dict[mapped_weight_name] = quant_state + yield org_weight_name, weight_tensor else: - yield weight_name, weight_tensor + yield org_weight_name, weight_tensor def _unquantized_generator(self, hf_weights_files, use_safetensors, quant_state_dict) -> Generator: @@ -937,18 +953,22 @@ class BitsAndBytesModelLoader(BaseModelLoader): tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if any(target_module in weight_name for target_module in - self.target_modules) and weight_name.endswith(".weight"): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if any(target_module in mapped_weight_name + for target_module in self.target_modules + ) and mapped_weight_name.endswith(".weight"): # Without sharding if any( - weight_name.startswith(module) + mapped_weight_name.startswith(module) for module in self.unsharded_weights_modules): weight_sub_tensor = weight_tensor # Shard by column elif any( - weight_name.startswith(module) + mapped_weight_name.startswith(module) for module in self.column_sharded_weights_modules): total_size = weight_tensor.size(-1) start_index = total_size // tp_size * tp_rank @@ -958,14 +978,14 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Weights have fused on disk. In this case, we assume that the # weight and module use same name. 
elif any( - weight_name.startswith(module) + mapped_weight_name.startswith(module) for module in self.maybe_fused_weights_modules): # special case for fused weights # get the size of each shard weight tensor total_shard_sizes = next( (sizes for module, sizes in self.maybe_fused_weights_modules.items() - if weight_name.startswith(module))) + if mapped_weight_name.startswith(module))) total_size = weight_tensor.size(0) assert total_size == sum(total_shard_sizes) # get the start/end index of each shard weight tensor @@ -1008,23 +1028,21 @@ class BitsAndBytesModelLoader(BaseModelLoader): quant_type="nf4", ) - quant_state_dict[weight_name] = quant_state + quant_state_dict[mapped_weight_name] = quant_state else: processed_weight = weight_tensor - - yield weight_name, processed_weight + yield org_weight_name, processed_weight def _get_bnb_target_modules(self, model: nn.Module) -> None: for name, module in model.named_modules(): if isinstance(module, (LinearBase, )): - last_name = name.split(".")[-1] - if sub_modules := self.modules_mapping.packed_mapping.get( - last_name, []): + if modules_info := self.modules_mapping.get_sub_modules(name): # Map vllm's names to transformers's names. + rep_name, sub_modules = modules_info for sub_name in sub_modules: self.target_modules.append( - name.replace(last_name, sub_name)) + name.replace(rep_name, sub_name)) # Add original module name even if the module has stacked map, # in case model has a mixture of disk-merged and disk-splitted # weights with same last name. 
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index eb334c1fd..7a82a695c 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -131,3 +131,10 @@ class ParamMapping: packed_name, index, ) + + def get_sub_modules(self, + module_name: str) -> Optional[Tuple[str, List[str]]]: + for key, value in self.packed_mapping.items(): + if module_name.endswith(key): + return key, value + return None diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 2319c3160..0a3011d36 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -638,6 +638,19 @@ def input_mapper_for_whisper( @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( "audio", get_max_whisper_audio_tokens) class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): + packed_modules_mapping = { + "self_attn.qkv_proj": [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + ], + "encoder_attn.kv_proj": ["encoder_attn.k_proj", "encoder_attn.v_proj"], + } + + hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={ + ".fc1.": ".mlp.fc1.", + ".fc2.": ".mlp.fc2." 
+ }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -731,10 +744,10 @@ class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) - mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) + # add fake zeros bias for k_proj to state_dict weights = _create_fake_bias_for_k_proj(weights) - return loader.load_weights(weights, mapper=mapper) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def _create_fake_bias_for_k_proj( -- GitLab From d1ca7df84d9f8853001bdf1c2900321d9cb5d64e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 4 Feb 2025 16:44:52 +0800 Subject: [PATCH 30/65] [VLM] Merged multi-modal processor for InternVL-based models (#12553) Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/source/contributing/model/multimodal.md | 6 +- docs/source/models/supported_models.md | 10 +- .../vision_language/test_h2ovl.py | 131 --- .../vision_language/test_models.py | 2 +- .../vision_language/vlm_utils/model_utils.py | 37 +- .../multimodal/processing/test_common.py | 4 +- .../multimodal/processing/test_h2ovl.py | 142 +++ .../multimodal/processing/test_internvl.py | 207 +---- .../multimodal/processing/test_llava_next.py | 15 +- .../processing/test_llava_onevision.py | 15 +- .../multimodal/processing/test_phi3v.py | 5 +- .../multimodal/processing/test_qwen2_vl.py | 5 +- vllm/model_executor/models/aria.py | 6 +- vllm/model_executor/models/blip2.py | 6 +- vllm/model_executor/models/chameleon.py | 6 +- vllm/model_executor/models/deepseek_vl2.py | 6 +- vllm/model_executor/models/fuyu.py | 6 +- vllm/model_executor/models/h2ovl.py | 621 ++++++++----- vllm/model_executor/models/internvl.py | 823 +++++++++++------- vllm/model_executor/models/llava.py | 6 +- 
.../model_executor/models/llava_next_video.py | 6 +- vllm/model_executor/models/llava_onevision.py | 6 +- vllm/model_executor/models/minicpmo.py | 28 +- vllm/model_executor/models/minicpmv.py | 50 +- vllm/model_executor/models/nvlm_d.py | 186 +++- vllm/model_executor/models/phi3v.py | 6 +- vllm/model_executor/models/qwen.py | 12 +- vllm/model_executor/models/qwen2_audio.py | 6 +- vllm/model_executor/models/qwen2_vl.py | 31 +- vllm/model_executor/models/ultravox.py | 6 +- vllm/multimodal/inputs.py | 11 + vllm/multimodal/processing.py | 6 +- vllm/multimodal/profiling.py | 3 +- vllm/multimodal/registry.py | 4 +- 34 files changed, 1434 insertions(+), 986 deletions(-) delete mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py create mode 100644 tests/models/multimodal/processing/test_h2ovl.py diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index 6c6f3b701..66a7554da 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -250,7 +250,11 @@ def get_max_image_tokens(self) -> int: And thus, we can override the method as: ```python -def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: +def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], +) -> Mapping[str, int]: return {"image": self.get_max_image_tokens()} ``` diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 4a0996469..fbdca189a 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -726,7 +726,7 @@ See [this page](#generative-models) for more information on how to use generativ * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. 
* * ✅︎ - * + * \* - * `Idefics3ForConditionalGeneration` * Idefics3 * T + I @@ -799,7 +799,7 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `NVLM_D_Model` * NVLM-D 1.0 - * T + IE+ + * T + I+ * `nvidia/NVLM-D-72B`, etc. * * ✅︎ @@ -859,7 +859,11 @@ See [this page](#generative-models) for more information on how to use generativ + Multiple items can be inputted per text prompt for this modality. :::{note} -To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. +To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. +::: + +:::{note} +H2O-VL series models will be available in V1 once we support backends other than FlashAttention. ::: :::{note} diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py deleted file mode 100644 index 9590adf6f..000000000 --- a/tests/models/decoder_only/vision_language/test_h2ovl.py +++ /dev/null @@ -1,131 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional, Tuple - -import pytest -import torch -from PIL.Image import Image -from transformers import AutoConfig - -# Import the functions to test -from vllm.model_executor.models.h2ovl import (calculate_num_blocks, - image_to_pixel_values_wrapper) -from vllm.multimodal.image import rescale_image_size - -models = [ - "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names - "h2oai/h2ovl-mississippi-2b", -] - - -def run_preprocessing_test( - image: Image, - config, - max_dynamic_patch: Optional[int] = None, -) -> Tuple[torch.Tensor, int]: - """Test the image preprocessing and calculate expected blocks.""" - - if max_dynamic_patch is None: - max_dynamic_patch = config.max_dynamic_patch - - width, height = image.size - use_MSAC = config.use_msac - - # Create the mapper function with 
the provided configuration - mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC) - pixel_values = mapper(image) - - # Calculate the expected number of blocks - if use_MSAC: - # First pass - blocks1, _, _, aspect_ratio = calculate_num_blocks( - width, - height, - config.min_dynamic_patch, - max_dynamic_patch, - config.vision_config.image_size, - use_thumbnail=False, # Thumbnail is handled separately - prior_aspect_ratio=None, - ) - - # Second pass - blocks2, _, _, _ = calculate_num_blocks( - width, - height, - config.min_dynamic_patch, - max_dynamic_patch, - config.vision_config.image_size, - use_thumbnail=False, - prior_aspect_ratio=aspect_ratio, - ) - - # Add thumbnail if use_thumbnail is True and total_blocks > 1 - if config.use_thumbnail: - blocks1 += 1 if blocks1 > 1 else 0 - blocks2 += 1 if blocks2 > 1 else 0 - - # Total blocks is the sum of blocks from both passes minus overlapping - total_blocks = blocks1 + blocks2 - 1 - - expected_blocks = total_blocks - - else: - blocks, _, _, _ = calculate_num_blocks( - width, - height, - config.min_dynamic_patch, - max_dynamic_patch, - config.vision_config.image_size, - use_thumbnail=False, - prior_aspect_ratio=None, - ) - expected_blocks = blocks - - if config.use_thumbnail and expected_blocks > 1: - expected_blocks += 1 - - return pixel_values, expected_blocks - - -@pytest.mark.parametrize("model_name", models) -@pytest.mark.parametrize( - "size_factors", - [ - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8]) -def test_image_preprocessing(image_assets, model_name, size_factors, - max_dynamic_patch): - """Test image preprocessing pipeline with different configurations.""" - # Load the configuration from the model - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) - - for asset in image_assets: - image = asset.pil_image - for factor in size_factors: 
- scaled_image = rescale_image_size(image, factor) - - # Test preprocessing and get expected number of blocks - pixel_values, expected_blocks = run_preprocessing_test( - scaled_image, config, max_dynamic_patch) - - # Verify output shapes and properties - actual_blocks = pixel_values.shape[0] - assert actual_blocks == expected_blocks, ( - f"Expected {expected_blocks} blocks, got {actual_blocks}") - - # Check image dimensions - expected_size = ( - 3, # Number of channels (C, H, W) - config.vision_config.image_size, - config.vision_config.image_size, - ) - for img in pixel_values: - assert img.shape == expected_size, ( - f"Expected image size {expected_size}, got {img.shape}") diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index e3cda8971..7a14ba2f3 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -250,6 +250,7 @@ VLM_TEST_SETTINGS = { max_model_len=8192, dtype="bfloat16", use_tokenizer_eos=True, + num_logprobs=10, patch_hf_runner=model_utils.h2ovl_patch_hf_runner, ), "idefics3": VLMTestInfo( @@ -282,7 +283,6 @@ VLM_TEST_SETTINGS = { dtype="bfloat16", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, - marks=[large_gpu_mark(min_gb=32)], ), "llava_next": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index b0a88161c..d2401b222 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -334,12 +334,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __init__(self, hf_runner: HfRunner): self.num_image_token = hf_runner.model.num_image_token self.tokenizer = hf_runner.tokenizer - self.dtype = 
hf_runner.model.dtype self.config = AutoConfig.from_pretrained(hf_runner.model_name, trust_remote_code=True) self.vision_config = self.config.vision_config self.use_thumbnail = self.config.use_thumbnail + self.use_msac = self.config.use_msac self.min_num = self.config.min_dynamic_patch self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size @@ -348,18 +348,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: **kwargs): # yapf: disable from vllm.model_executor.models.h2ovl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl) # yapf: enable images = [images] if isinstance(images, Image) else images pixel_values = [ - image_to_pixel_values(image, - self.image_size, - self.min_num, - self.max_num, - self.use_thumbnail, - use_MSAC=self.config.use_msac).to( - self.dtype) for image in images + image_to_pixel_values_h2ovl( + image, + input_size=self.image_size, + min_num=self.min_num, + max_num=self.max_num, + use_thumbnail=self.use_thumbnail, + use_msac=self.use_msac, + ) for image in images ] num_patches_list = [ pixel_value.shape[0] for pixel_value in pixel_values @@ -394,7 +395,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __init__(self, hf_runner: HfRunner): self.num_image_token = hf_runner.model.num_image_token self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype self.config = AutoConfig.from_pretrained(hf_runner.model_name, trust_remote_code=True) @@ -407,13 +407,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __call__(self, text: str, images: Union[Image, List[Image]], **kwargs): from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + IMG_CONTEXT, IMG_END, IMG_START, + image_to_pixel_values_internvl) images = [images] if isinstance(images, Image) else images pixel_values = [ - image_to_pixel_values(image, 
self.image_size, self.min_num, - self.max_num, - self.use_thumbnail).to(self.dtype) - for image in images + image_to_pixel_values_internvl( + image, + input_size=self.image_size, + min_num=self.min_num, + max_num=self.max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images ] num_patches_list = [ pixel_value.shape[0] for pixel_value in pixel_values @@ -448,7 +452,8 @@ def _internvl_generate( ) -> torch.LongTensor: """Generate method for InternVL2 model without fixed use_cache.""" assert self.img_context_token_id is not None - vit_embeds = self.extract_feature(pixel_values) + target_dtype = next(self.parameters()).dtype + vit_embeds = self.extract_feature(pixel_values.to(target_dtype)) input_embeds = self.language_model.get_input_embeddings()(input_ids) B, N, C = input_embeds.shape input_embeds = input_embeds.reshape(B * N, C) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 3921d4e19..07906a71d 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -141,13 +141,14 @@ def _test_processing_correctness( # yapf: disable -# True if the model supports multiple data items of the modality per request @pytest.mark.parametrize("model_id", [ "rhymes-ai/Aria", "Salesforce/blip2-opt-2.7b", "facebook/chameleon-7b", "deepseek-ai/deepseek-vl2-tiny", "adept/fuyu-8b", + "h2oai/h2ovl-mississippi-800m", + "OpenGVLab/InternVL2-1B", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", @@ -156,6 +157,7 @@ def _test_processing_correctness( "mistral-community/pixtral-12b", "openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-V-2_6", + "nvidia/NVLM-D-72B", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py new file mode 100644 index 000000000..767ac5eb9 --- 
/dev/null +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for H2OVL's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.utils import cached_get_tokenizer + +from ....conftest import _ImageAssets +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", [ + "h2oai/h2ovl-mississippi-800m", + "h2oai/h2ovl-mississippi-2b", +]) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8]) +@pytest.mark.parametrize("dynamic_image_size", [True, False]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override( + model_id: str, + image_assets: _ImageAssets, + size_factors: list[int], + max_dynamic_patch: int, + dynamic_image_size: Optional[bool], + num_imgs: int, +): + from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, + get_h2ovl_target_ratios) + + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + trust_remote_code=True, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) + + config = processor.info.get_hf_config() + use_msac = config.use_msac + + mm_processor_kwargs = { + "max_dynamic_patch": max_dynamic_patch, + } + if dynamic_image_size is not None: + mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size + + min_num = config.min_dynamic_patch + max_num = max_dynamic_patch if dynamic_image_size else 1 + + # Build the image str / prompt 
based on the number of images we pass + prompt = "" * num_imgs + + for asset in image_assets: + for factor in size_factors: + image = rescale_image_size(asset.pil_image, factor) + mm_data = {"image": [image] * num_imgs} + + width, height = image.size + + # Calculate the expected number of blocks + if num_imgs == 1 and use_msac: + # First pass + blocks1, _, _, aspect_ratio = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=None, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + ) + + # Second pass + blocks2, _, _, _ = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=aspect_ratio, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, + ) + + # Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 if blocks2 > 1 else 0 + + # Total blocks is the sum of blocks from both passes minus + # overlapping + total_blocks = blocks1 + blocks2 - 1 + + expected_num_patches = total_blocks + else: + blocks, _, _, _ = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=None, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, + ) + expected_num_patches = blocks + + if config.use_thumbnail and expected_num_patches != 1: + expected_num_patches += 1 + + processed_inputs = processor.apply(prompt, mm_data, + mm_processor_kwargs) + pixel_shape = ( + processed_inputs["mm_kwargs"]["pixel_values_flat"].shape) + + assert pixel_shape[0] == expected_num_patches * num_imgs diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index 0d921e9d3..ede961225 
100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,207 +1,64 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for InternVL's multimodal preprocessing kwargs.""" -from typing import Callable, Optional +from typing import Optional import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputContext, token_inputs -from vllm.multimodal import MultiModalRegistry +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from ....conftest import _ImageAssets from ...utils import build_model_context -models = ["OpenGVLab/InternVL2-2B"] - -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def input_processor_for_internvl(): - from vllm.model_executor.models.internvl import InternVLInputPipeline - - pipeline = InternVLInputPipeline('', '', '') - return pipeline.input_processor - - -@pytest.fixture() -def dummy_data_for_internvl(): - from vllm.model_executor.models.internvl import InternVLInputPipeline - - pipeline = InternVLInputPipeline('', '', '') - return pipeline.dummy_data - - -@pytest.fixture() -def get_max_internvl_image_tokens(): - from vllm.model_executor.models.internvl import ( - get_max_internvl_image_tokens) - return get_max_internvl_image_tokens - - -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"]) @pytest.mark.parametrize("max_dynamic_patch", [1, 4]) @pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -def test_input_mapper_override( - model: str, +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override( + model_id: str, image_assets: _ImageAssets, max_dynamic_patch: int, dynamic_image_size: Optional[bool], -): - mm_processor_kwargs = { - "max_dynamic_patch": max_dynamic_patch, - } - if dynamic_image_size is not None: - mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size - 
- expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 - if dynamic_image_size is False: - expected_num_patches = 1 - - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - ) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) - vllm_result = mm_registry.map_input( - ctx.model_config, - {"image": image}, - ) - assert vllm_result["pixel_values"].size(1) == expected_num_patches - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) -@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -def test_max_tokens_override( - get_max_internvl_image_tokens: Callable, - model: str, - max_dynamic_patch: Optional[int], - dynamic_image_size: Optional[bool], -): - """Ensure get_max_internvl_image_tokens handles mm_processor_kwargs.""" - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - if max_dynamic_patch is None: - max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch - expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 - if dynamic_image_size is False: - expected_num_patches = 1 - expected_max_tokens = 256 * expected_num_patches - - actual_max_tokens = get_max_internvl_image_tokens( - ctx=InputContext(ctx.model_config), - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("num_imgs", [1, 2]) -@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) -@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -def test_dummy_data_override( - dummy_data_for_internvl: Callable, - model: str, num_imgs: int, - 
max_dynamic_patch: Optional[int], - dynamic_image_size: Optional[bool], ): - """Ensure dummy_data_for_internvl handles kwargs properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the dummy data func. ctx = build_model_context( - model_name=model, - tokenizer_name=model, + model_name=model_id, + tokenizer_name=model_id, trust_remote_code=True, mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, ) - - if max_dynamic_patch is None: - max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch - expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 - if dynamic_image_size is False: - expected_num_patches = 1 - expected_max_tokens = 256 * expected_num_patches - - dummy_data = dummy_data_for_internvl( - ctx=ctx, - seq_len=8192, # Should be bigger than num_imgs * toks_per_img - mm_counts={"image": num_imgs}, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, ) - sequence_data = dummy_data.seq_data - - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) - image_token_id = tokenizer.encode('', - add_special_tokens=False)[0] - # Ensure we have the right number of placeholders per size - img_tok_count = sequence_data.get_token_ids().count(image_token_id) - assert img_tok_count == expected_max_tokens * num_imgs + mm_processor_kwargs = { + "max_dynamic_patch": max_dynamic_patch, + } + if dynamic_image_size is not None: + mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size + # Build the image str / prompt based on the number of images we pass + prompt = "" * num_imgs + image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) + 
mm_data = {"image": [image] * num_imgs} -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) -@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_input_processor_override( - input_processor_for_internvl: Callable, - image_assets: _ImageAssets, - model: str, - num_imgs: int, - max_dynamic_patch: int, - dynamic_image_size: Optional[bool], -): - """Ensure input_processor_for_internvl handles kwargs properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 if dynamic_image_size is False: expected_num_patches = 1 - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - expected_toks_per_img = 256 * expected_num_patches - - # Build the image str / prompt based on the number of images we pass - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) - placeholders = "" if num_imgs == 1 else "\n".join( - f"Image-{i}: \n" for i in range(1, num_imgs + 1)) - prompt = placeholders - images = [image_assets[0].pil_image.resize((448 * 2, 448 * 2))] * num_imgs - - inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), - prompt=prompt, - multi_modal_data={"image": images}) - - processed_inputs = input_processor_for_internvl( - ctx, - inputs, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) + processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size - image_token_id = tokenizer.encode('', - add_special_tokens=False)[0] + image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = 
processed_inputs["prompt_token_ids"].count(image_token_id) - assert img_tok_count == expected_toks_per_img * num_imgs + pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + + assert img_tok_count == 256 * expected_num_patches * num_imgs + assert pixel_shape[0] == expected_num_patches * num_imgs diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index d2497e62d..fe4754c2e 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -43,7 +43,10 @@ def test_processor_max_tokens(model_id): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) info = processor.info @@ -143,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), @@ -173,7 +179,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) seen_aspect_ratios = set[float]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index bd4dbd46d..fb650d9e0 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ 
b/tests/models/multimodal/processing/test_llava_onevision.py @@ -44,7 +44,10 @@ def test_processor_max_tokens(model_id): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) info = processor.info @@ -143,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), @@ -174,7 +180,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) seen_aspect_ratios = set[float]() diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index 44edec457..dde8904f2 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -38,7 +38,10 @@ def test_processor_override( trust_remote_code=True, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 
47c9b0add..ef8e97f82 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -33,7 +33,10 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 97502c38b..98df532aa 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -399,7 +399,11 @@ class AriaProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 2b0452222..0463a0b97 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -407,7 +407,11 @@ class Blip2ProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 9061a3128..b29dd65a8 100644 --- a/vllm/model_executor/models/chameleon.py +++ 
b/vllm/model_executor/models/chameleon.py @@ -64,7 +64,11 @@ class ChameleonProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 1343b9762..0eaf3a620 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -165,7 +165,11 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): image_width=x[1], image_height=x[0])) return ImageSize(width=width, height=height) - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: max_image_size = self.get_image_size_with_most_features() max_image_tokens = self.get_num_image_tokens( image_height=max_image_size.height, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 6d8c82968..50b5ef35d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -80,7 +80,11 @@ class FuyuProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: target_width, target_height = self.get_image_size_with_most_features() max_ncols, max_nrows = self.get_image_feature_grid_size( diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 91c89b159..cf3e777a2 100644 --- 
a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -7,43 +7,55 @@ # Copyright (c) 2024 H2O.AI # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from functools import partial -from typing import List, Optional, Tuple +from typing import Mapping, Optional import torch from PIL import Image from transformers import PretrainedConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + MultiModalDataItems) +from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.transformers_utils.tokenizer import AnyTokenizer from .intern_vit import InternVisionModel -from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, InternVLChatModel, - InternVLInputPipeline, build_transform, - find_closest_aspect_ratio, get_internvl_num_patches) +from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, + BaseInternVLProcessingInfo, BaseInternVLProcessor, + InternVLChatModel, InternVLDummyInputsBuilder, + InternVLMultiModalProcessor, build_transform, + find_closest_aspect_ratio, get_internvl_target_ratios) +logger = init_logger(__name__) -# modified to include blocks generated in second pass -def calculate_num_blocks( - orig_width: int, - orig_height: int, - min_num: int, - max_num: int, - image_size: int, + +def resolve_h2ovl_min_max_num( + *, + min_dynamic_patch: int, + max_dynamic_patch: int, + 
dynamic_image_size: bool, use_thumbnail: bool, - prior_aspect_ratio=None, -) -> Tuple[int, int, int, Tuple[int, int]]: - aspect_ratio = orig_width / orig_height +) -> tuple[int, int]: + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 - # calculate the existing image aspect ratio - target_ratios = set((i, j) for n in range(min_num, max_num + 1) - for i in range(1, n + 1) for j in range(1, n + 1) - if i * j <= max_num and i * j >= min_num) - target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + return min_dynamic_patch, max_dynamic_patch + + +def get_h2ovl_target_ratios( + min_num: int, + max_num: int, + *, + prior_aspect_ratio: Optional[tuple[int, int]], +) -> list[tuple[int, int]]: + target_ratios = get_internvl_target_ratios(min_num, max_num) # if prior_aspect_ratio is provided, filter the target ratios if prior_aspect_ratio is not None: @@ -52,44 +64,66 @@ def calculate_num_blocks( ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0 ] + return target_ratios + + +# modified to include blocks generated in second pass +def calculate_h2ovl_targets( + *, + orig_width: int, + orig_height: int, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int, tuple[int, int]]: + aspect_ratio = orig_width / orig_height + # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, - target_ratios, orig_width, - orig_height, image_size) + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + image_size=image_size, + ) # calculate the target width and height target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] blocks = target_aspect_ratio[0] * target_aspect_ratio[1] - # add thumbnail image if num_blocks > 1 - if use_thumbnail and blocks > 1: + + # add 
thumbnail image if num_blocks != 1 + if use_thumbnail and blocks != 1: blocks += 1 + return blocks, target_width, target_height, target_aspect_ratio # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -# refactored to handle prior_aspect_ratio as optional -def dynamic_preprocess( +# refactored to handle prior_aspect_ratio +def dynamic_preprocess_h2ovl( image: Image.Image, - min_num: int, - max_num: int, + *, + target_ratios: list[tuple[int, int]], image_size: int, use_thumbnail: bool, - prior_aspect_ratio: Optional[Tuple[int, int]] = None, -) -> Tuple[List[Image.Image], Tuple[int, int]]: +) -> tuple[list[Image.Image], tuple[int, int]]: orig_width, orig_height = image.size - # calculate the number of blocks based on prior aspect ratio if available - blocks, target_width, target_height, target_aspect_ratio = ( - calculate_num_blocks( - orig_width, - orig_height, - min_num, - max_num, - image_size, - use_thumbnail=False, - prior_aspect_ratio=prior_aspect_ratio, - )) + # calculate the number of blocks without thumbnail + ( + blocks, + target_width, + target_height, + target_aspect_ratio, + ) = calculate_h2ovl_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] @@ -103,276 +137,393 @@ def dynamic_preprocess( # split the image split_img = resized_img.crop(box) processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: thumbnail_img = image.resize((image_size, image_size)) processed_images.append(thumbnail_img) + return processed_images, target_aspect_ratio -def load_image( +def _preprocess_image( image: Image.Image, - input_size=448, - min_num=1, - max_num=6, - use_thumbnail=True, - prior_aspect_ratio: Optional[Tuple[int, int]] = None, -) -> Tuple[torch.Tensor, Tuple[int, int]]: + *, + input_size: int, + 
min_num: int, + max_num: int, + use_thumbnail: bool, + prior_aspect_ratio: Optional[tuple[int, int]], +) -> tuple[torch.Tensor, tuple[int, int]]: + target_ratios = get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=prior_aspect_ratio, + ) + transform = build_transform(input_size=input_size) - images, target_aspect_ratio = dynamic_preprocess( + images, target_aspect_ratio = dynamic_preprocess_h2ovl( image, image_size=input_size, use_thumbnail=use_thumbnail, - min_num=min_num, - max_num=max_num, - prior_aspect_ratio=prior_aspect_ratio, + target_ratios=target_ratios, ) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) + + pixel_values = torch.stack([transform(image) for image in images]) return pixel_values, target_aspect_ratio -# refactored to use the combined load_image function -def image_to_pixel_values( +# refactored to use the _preprocess_image function +def image_to_pixel_values_h2ovl( image: Image.Image, + *, input_size: int, min_num: int, max_num: int, use_thumbnail: bool, - use_MSAC: bool, + use_msac: bool, ) -> torch.Tensor: # when MSAC is turned on, we need to process the image twice - if use_MSAC: + if use_msac: # first pass - pixel_values, target_aspect_ratio = load_image( + pixel_values1, aspect_ratio1 = _preprocess_image( image, input_size=input_size, min_num=min_num, max_num=max_num, use_thumbnail=True, + prior_aspect_ratio=None, ) # second pass - pixel_values2, _ = load_image( + pixel_values2, _ = _preprocess_image( image, input_size=input_size, - min_num=min_num, + min_num=3, # Hardcoded value max_num=max_num, - prior_aspect_ratio=target_aspect_ratio, + use_thumbnail=True, + prior_aspect_ratio=aspect_ratio1, ) # combine pixel values pixel_values = torch.cat( - [pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0) + [pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0) else: - pixel_values, _ = load_image( + pixel_values, _ = _preprocess_image( image, 
input_size=input_size, min_num=min_num, max_num=max_num, use_thumbnail=use_thumbnail, + prior_aspect_ratio=None, ) return pixel_values -def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None, - use_MSAC: Optional[bool] = None): - image_size = hf_config.vision_config.image_size - min_num = hf_config.min_dynamic_patch - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - if use_MSAC is None: - use_MSAC = hf_config.use_msac - use_thumbnail = hf_config.use_thumbnail - return partial( - image_to_pixel_values, - input_size=image_size, - min_num=min_num, - max_num=max_dynamic_patch, - use_thumbnail=use_thumbnail, - use_MSAC=use_MSAC, - ) - - -def get_max_internvl_image_tokens(ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None): - """ - Calculate the maximum number of tokens with/without MSAC and thumbnail - """ - hf_config = ctx.get_hf_config() - use_thumbnail = hf_config.use_thumbnail - use_MSAC = hf_config.use_msac +class H2OVLProcessor(BaseInternVLProcessor): - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_msac: Optional[bool] = None, + ) -> None: + super().__init__( + config, + tokenizer, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) - num_patches = get_internvl_num_patches(hf_config) + if use_msac is None: + use_msac = config.use_msac + assert isinstance(use_msac, bool) - coefficient = 2 if use_MSAC else 1 - num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0) + self.use_msac = use_msac - return num_blocks * num_patches + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_CONTEXT] + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) 
-> str: + return IMG_CONTEXT * feature_size -class H2OVLInputPipeline(InternVLInputPipeline): - """ - Input pipeline for processing image and text data for the H2OVL model. - """ + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + features = self.get_image_repl_features(feature_size, num_patches) + return IMG_START + features + IMG_END - def input_processor( + def resolve_min_max_num( self, - ctx: InputContext, - inputs: DecoderOnlyInputs, *, max_dynamic_patch: Optional[int] = None, - ) -> DecoderOnlyInputs: - # get multi_modal_data - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_config() - use_MSAC = hf_config.use_msac - - image_data = multi_modal_data["image"] - num_patches = get_internvl_num_patches(hf_config) - - image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch=max_dynamic_patch) - - # single image - if isinstance(image_data, Image.Image): - pixel_values = image_pixel_values_mapper(image_data, - use_MSAC=use_MSAC) - num_blocks = pixel_values.shape[0] - image_feature_sizes = [num_blocks * num_patches] - pixel_values = pixel_values.unsqueeze(0) - - # multi images - elif is_list_of(image_data, Image.Image): - # Do not use MSAC for multi images - image_feature_sizes = [] - pixel_values = [ - image_pixel_values_mapper(image, use_MSAC=False) - for image in image_data - ] - for pixel_value in pixel_values: - num_blocks = pixel_value.shape[0] - image_feature_sizes.append(num_blocks * num_patches) - - # image embeddings as input - elif isinstance(image_data, torch.Tensor): - _, image_feature_size, _ = image_data.shape - image_feature_sizes = [image_feature_size] - pixel_values = None - - # multi-image image embeddings - elif is_list_of(image_data, torch.Tensor): - - image_feature_sizes = [] - for image_embed in image_data: - _, 
image_feature_size, _ = image_embed.shape - image_feature_sizes.append(image_feature_size) - pixel_values = None + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + ) -> tuple[int, int]: + min_dynamic_patch = self.min_dynamic_patch + max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch + is None else max_dynamic_patch) + dynamic_image_size = (self.dynamic_image_size if dynamic_image_size + is None else dynamic_image_size) + use_thumbnail = (self.use_thumbnail + if use_thumbnail is None else use_thumbnail) + + return resolve_h2ovl_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) - else: - raise TypeError(f"Invalid image type: {type(image_data)}") + def resolve_target_ratios( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + prior_aspect_ratio: Optional[tuple[int, int]] = None, + ) -> list[tuple[int, int]]: + min_num, max_num = self.resolve_min_max_num( + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) + if prior_aspect_ratio: # hardcoded value for second pass of use_msac + min_num = 3 - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + return get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=prior_aspect_ratio, ) - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - - new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, - num_patches) - new_prompt_token_ids = tokenizer.encode(new_prompt) - - # Wrap image processing in input_processor to avoid duplication - image_token_id = tokenizer.encode( - self.img_context_token, - add_special_tokens=False, - 
return_tensors="pt", - )[0] - - # Update multi_modal_data to return - if pixel_values is not None: - multi_modal_data = { - "image": { - "pixel_values": pixel_values, - "image_token_id": image_token_id, - } - } + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + use_msac: Optional[bool] = None, + ) -> int: + use_msac = (self.use_msac if use_msac is None else use_msac) + + use_thumbnail = self.use_thumbnail + + if use_msac: + target_ratios_1 = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios_1, + use_thumbnail=True, + ) + + target_ratios_2 = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + prior_aspect_ratio=aspect_ratio_1, + ) + num_patches_2, _, _, _ = calculate_h2ovl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios_2, + use_thumbnail=True, + ) + + num_patches = num_patches_1 + num_patches_2 - 1 else: - multi_modal_data = {"image": {"image_embeds": image_data}} + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + num_patches, _, _, _ = calculate_h2ovl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=use_thumbnail, + ) + + return num_patches * self.num_image_token - return token_inputs( - prompt=prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data, + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + use_msac = self.use_msac if len(images) == 1 else False + + min_num, max_num = self.resolve_min_max_num( + 
max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values ) - def input_mapper( + return [ + image_to_pixel_values_h2ovl( + image, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + use_msac=use_msac, + ) for image in images + ] + + +class H2OVLProcessingInfo(BaseInternVLProcessingInfo): + + def get_hf_processor( self, - ctx: InputContext, - data: object, *, max_dynamic_patch: Optional[int] = None, - ) -> MultiModalKwargs: + dynamic_image_size: Optional[bool] = None, + ) -> H2OVLProcessor: + return H2OVLProcessor( + self.get_hf_config(), + self.get_tokenizer(), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + max_tokens_one_image = self.get_max_image_tokens(use_msac=None) + if mm_counts.get("image", 0) <= 1: + max_tokens_per_image = max_tokens_one_image + else: + max_tokens_per_image = self.get_max_image_tokens(use_msac=False) + + return {"image": max_tokens_per_image} - # NOTE: Preprocessing for the image data is done in the - # 'input_processor' function during actual inference. - if isinstance(data, dict): - return MultiModalKwargs(data) + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[H2OVLProcessor], + use_msac: Optional[bool] = None, + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + use_msac=use_msac, + ) - # The section below is only used with dummy data during - # memory profiling. 
- hf_config = ctx.get_hf_config() + def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int: + target_width, target_height = self.get_image_size_with_most_features() - image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch) + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, + use_msac=use_msac, + ) - if isinstance(data, Image.Image): - pixel_values = image_pixel_values_mapper(data) - pixel_values = pixel_values.unsqueeze(0) - elif is_list_of(data, Image.Image): - hf_config.use_msac = False - pixel_values = [image_pixel_values_mapper(img) for img in data] +class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo] + ): + + def __init__(self, + info: H2OVLProcessingInfo, + dummy_inputs: "BaseDummyInputsBuilder[H2OVLProcessingInfo]", + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__( + info, + dummy_inputs, + cache=cache, + enable_sanity_checks=enable_sanity_checks, + ) + + if self.cache is not None: + # The processor output depends on the number of images passed, + # making it incompatible with processing cache which is supposed + # to be invariant of how many images are passed per prompt + self.cache = None + logger.warning_once( + f"{type(self).__name__} does not support processing cache.") + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # TODO: Use image size information in dictionary embedding inputs + # to compute 
num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) else: - return MultiModalKwargs({"image_embeds": data}) - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, - ) - image_token_id = tokenizer.encode( - self.img_context_token, - add_special_tokens=False, - return_tensors="pt", - )[0] + image_num_patches = [] + + num_images = len(image_num_patches) - return MultiModalKwargs({ - "pixel_values": pixel_values, - "image_token_id": image_token_id - }) + def get_replacement_internvl(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + use_msac=None if num_images == 1 else False, + ) + + num_patches = image_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return PromptReplacementDetails( + full=hf_processor.get_image_repl_full(feature_size, + num_patches), + features=hf_processor.get_image_repl_features( + feature_size, num_patches), + ) -input_pipeline = H2OVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) + return [ + PromptReplacement( + modality="image", + target="", + replacement=get_replacement_internvl, + ) + ] -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) -@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) +@MULTIMODAL_REGISTRY.register_processor( + H2OVLMultiModalProcessor, + info=H2OVLProcessingInfo, + dummy_inputs=InternVLDummyInputsBuilder) 
class H2OVLChatModel(InternVLChatModel): def _init_vision_model( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index c46a867a7..08fc659ab 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -6,35 +6,37 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -import re -from functools import cached_property, partial +from abc import ABC, abstractmethod +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) + TypedDict, TypeVar, Union) import torch import torch.nn as nn import torchvision.transforms as T from PIL import Image -from transformers import PretrainedConfig +from transformers import BatchFeature, PretrainedConfig, TensorType from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + 
BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of +from vllm.transformers_utils.tokenizer import AnyTokenizer -from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, - get_clip_num_patches) from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -75,22 +77,27 @@ InternVLImageInputs = Union[InternVLImagePixelInputs, InternVLImageEmbeddingInputs] -# copied from https://huggingface.co/OpenGVLab/InternVL2-1B -def build_transform(input_size): +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD - transform = T.Compose([ + return T.Compose([ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=MEAN, std=STD) ]) - return transform -# copied from https://huggingface.co/OpenGVLab/InternVL2-1B -def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, - image_size): +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def find_closest_aspect_ratio( + aspect_ratio: float, + target_ratios: list[tuple[int, int]], + *, + width: int, + height: int, + image_size: int, +) -> tuple[int, int]: best_ratio_diff = float('inf') best_ratio = (1, 1) area = width * height @@ -106,67 +113,82 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, return best_ratio -def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, - max_num: int, image_size: int, - use_thumbnail: bool) -> Tuple[int, int, int]: - aspect_ratio = orig_width / orig_height +def resolve_internvl_min_max_num( + *, + min_dynamic_patch: int, + 
max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, +) -> tuple[int, int]: + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 + + return min_dynamic_patch, max_dynamic_patch + - # calculate the existing image aspect ratio - target_ratios = set((i, j) for n in range(min_num, max_num + 1) - for i in range(1, n + 1) for j in range(1, n + 1) - if i * j <= max_num and i * j >= min_num) - target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) +def get_internvl_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +def calculate_internvl_targets( + *, + orig_width: int, + orig_height: int, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int]: + aspect_ratio = orig_width / orig_height # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, - target_ratios, orig_width, - orig_height, image_size) + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + image_size=image_size, + ) # calculate the target width and height target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] blocks = target_aspect_ratio[0] * target_aspect_ratio[1] - # add thumbnail image if num_blocks > 1 - if use_thumbnail and blocks > 1: - blocks += 1 - return blocks, target_width, target_height - -def calculate_num_blocks_wrapper( - hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - if dynamic_image_size is None: - dynamic_image_size = 
hf_config.dynamic_image_size + # add thumbnail image if num_blocks != 1 + if use_thumbnail and blocks != 1: + blocks += 1 - max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - min_num = hf_config.min_dynamic_patch - image_size = hf_config.vision_config.image_size - use_thumbnail = hf_config.use_thumbnail - return partial(calculate_num_blocks, - min_num=min_num, - max_num=max_dynamic_patch, - image_size=image_size, - use_thumbnail=use_thumbnail) + return blocks, target_width, target_height # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, - image_size: int, - use_thumbnail: bool) -> List[Image.Image]: +def dynamic_preprocess_internvl( + image: Image.Image, + *, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> list[Image.Image]: orig_width, orig_height = image.size # calculate the number of blocks without thumbnail - blocks, target_width, target_height = calculate_num_blocks( - orig_width, - orig_height, - min_num, - max_num, - image_size, - use_thumbnail=False) + blocks, target_width, target_height = calculate_internvl_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] @@ -178,301 +200,463 @@ def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, # split the image split_img = resized_img.crop(box) processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: thumbnail_img = image.resize((image_size, image_size)) processed_images.append(thumbnail_img) + return processed_images # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def image_to_pixel_values(image: 
Image.Image, input_size: int, min_num: int, - max_num: int, use_thumbnail: bool) -> torch.Tensor: +def image_to_pixel_values_internvl( + image: Image.Image, + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_internvl_target_ratios(min_num, max_num) + transform = build_transform(input_size=input_size) - images = dynamic_preprocess(image, - min_num=min_num, - max_num=max_num, - image_size=input_size, - use_thumbnail=use_thumbnail) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) + images = dynamic_preprocess_internvl( + image, + target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + + pixel_values = torch.stack([transform(image) for image in images]) return pixel_values -def image_to_pixel_values_wrapper( - hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - image_size = hf_config.vision_config.image_size - min_num = hf_config.min_dynamic_patch - if dynamic_image_size is None: - dynamic_image_size = hf_config.dynamic_image_size +class BaseInternVLProcessor(ABC): + """ + This model doesn't define its own HF processor, + so we implement our own one here. 
- max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - use_thumbnail = hf_config.use_thumbnail - return partial(image_to_pixel_values, - input_size=image_size, - min_num=min_num, - max_num=max_dynamic_patch, - use_thumbnail=use_thumbnail) - - -def get_internvl_num_patches(hf_config: PretrainedConfig): - vision_config = hf_config.vision_config - downsample_ratio = hf_config.downsample_ratio - image_size = vision_config.image_size - patch_size = vision_config.patch_size - return int( - get_clip_num_patches(image_size=image_size, patch_size=patch_size) * - (downsample_ratio**2)) - - -def get_max_internvl_image_tokens( - ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - hf_config = ctx.get_hf_config() - if dynamic_image_size is None: - dynamic_image_size = hf_config.dynamic_image_size + The code to insert image tokens is based on: + https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252 + """ - max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - use_thumbnail = hf_config.use_thumbnail - if use_thumbnail and max_dynamic_patch > 1: - max_dynamic_patch += 1 + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> None: + super().__init__() - num_patches = get_internvl_num_patches(hf_config) - return num_patches * max_dynamic_patch + self.config = config + self.tokenizer = tokenizer + image_size: int = config.vision_config.image_size + patch_size: int = config.vision_config.patch_size -def get_max_internvl_image_size( - ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - hf_config = 
ctx.get_hf_config() - image_size = hf_config.vision_config.image_size - if dynamic_image_size is None: - dynamic_image_size = hf_config.dynamic_image_size + if dynamic_image_size is None: + dynamic_image_size = config.dynamic_image_size + assert isinstance(dynamic_image_size, bool) - max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - use_thumbnail = hf_config.use_thumbnail - if use_thumbnail and max_dynamic_patch > 1: - max_dynamic_patch += 1 - width = image_size * max_dynamic_patch - height = image_size - return width, height + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + assert isinstance(max_dynamic_patch, int) + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.image_size = image_size + self.min_dynamic_patch: int = config.min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail: bool = config.use_thumbnail + + @property + @abstractmethod + def image_token_id(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + raise NotImplementedError -class InternVLInputPipeline: + @abstractmethod + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + raise NotImplementedError - def __init__( + def resolve_min_max_num( self, - img_start_token: str, - img_end_token: str, - img_context_token: str, - ) -> None: - super().__init__() + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + ) -> tuple[int, int]: + min_dynamic_patch = self.min_dynamic_patch + max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch + is None else max_dynamic_patch) + dynamic_image_size = 
(self.dynamic_image_size if dynamic_image_size + is None else dynamic_image_size) + use_thumbnail = (self.use_thumbnail + if use_thumbnail is None else use_thumbnail) + + return resolve_internvl_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) - self.img_start_token = img_start_token - self.img_end_token = img_end_token - self.img_context_token = img_context_token + def resolve_target_ratios( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + ) -> list[tuple[int, int]]: + min_num, max_num = self.resolve_min_max_num( + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) - def _create_image_prompt(self, feature_size: int, num_patches: int) -> str: - return (self.img_start_token + self.img_context_token * feature_size + - self.img_end_token) + return get_internvl_target_ratios(min_num, max_num) - def _expand_image_prompt( + def get_num_image_tokens( self, - prompt: str, - feature_sizes: List[int], - num_patches: int, - ) -> str: - image_idx = sorted( - map(int, re.findall(r"Image-(\d+): <image>\n", prompt))) + *, + image_width: int, + image_height: int, + ) -> int: + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + + num_patches, _, _ = calculate_internvl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=self.use_thumbnail, + ) - new_prompt = prompt - for idx, feature_size in enumerate(feature_sizes, start=1): - image_prompt = self._create_image_prompt(feature_size, num_patches) - if not image_idx: - image_prompt = f"Image-{idx}: {image_prompt}" + return num_patches * self.num_image_token - new_prompt = new_prompt.replace('<image>', image_prompt, 1) + def 
_images_to_pixel_values_lst( + self, + images: list[Image.Image], + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) - return new_prompt + return [ + image_to_pixel_values_internvl( + image, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] - def input_processor( + def __call__( self, - ctx: InputContext, - inputs: DecoderOnlyInputs, - *, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, - ) -> DecoderOnlyInputs: - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_config() - - image_data = multi_modal_data["image"] - num_patches = get_internvl_num_patches(hf_config) - num_blocks_calculator = calculate_num_blocks_wrapper( - hf_config, max_dynamic_patch, dynamic_image_size) - if isinstance(image_data, Image.Image): - width, height = image_data.size - num_blocks, _, _ = num_blocks_calculator(width, height) - image_feature_sizes = [num_blocks * num_patches] - elif is_list_of(image_data, Image.Image): - image_feature_sizes = [] - for image in image_data: - width, height = image.size - num_blocks, _, _ = num_blocks_calculator(width, height) - image_feature_sizes.append(num_blocks * num_patches) - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - image_feature_sizes = [image_feature_size] + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + 
if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + if len(images) == 0: + image_inputs = {} else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - - new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, - num_patches) - new_prompt_token_ids = tokenizer.encode(new_prompt) - img_context_token_id = tokenizer.encode(self.img_context_token, - add_special_tokens=False) - assert len(img_context_token_id) == 1, \ - (f"Invalid image token '{self.img_context_token}': A valid image " - f"token encodes to a single token ID, got {img_context_token_id}.") - img_context_token_id = img_context_token_id[0] - - # Get precise tracking of placeholder positions - token_idx = image_idx = 0 - placeholder_ranges = [] - while token_idx < len(new_prompt_token_ids): - if new_prompt_token_ids[token_idx] == img_context_token_id: - curr_image_featue_size = image_feature_sizes[image_idx] - placeholder_ranges.append( - PlaceholderRange(offset=token_idx, - length=curr_image_featue_size)) - image_idx += 1 - token_idx += curr_image_featue_size - else: - token_idx += 1 + pixel_values_lst = self._images_to_pixel_values_lst( + images, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + image_inputs = { + "pixel_values_flat": torch.cat(pixel_values_lst), + "image_num_patches": list(map(len, pixel_values_lst)), + } + + for pixel_values in pixel_values_lst: + num_patches = pixel_values.shape[0] + feature_size = num_patches * self.num_image_token + + image_repl = self.get_image_repl_full(feature_size, + num_patches) + text = [t.replace('<image>', image_repl, 1) for t in text] + + text_inputs = 
self.tokenizer(text) + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) - return token_inputs( - prompt=prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) - def input_mapper( +class InternVLProcessor(BaseInternVLProcessor): + + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_CONTEXT] + + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + return IMG_CONTEXT * feature_size + + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + features = self.get_image_repl_features(feature_size, num_patches) + return IMG_START + features + IMG_END + + +class BaseInternVLProcessingInfo(BaseProcessingInfo): + + @abstractmethod + def get_hf_processor( self, - ctx: InputContext, - data: object, *, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, - ): - hf_config = ctx.get_hf_config() - - image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch, dynamic_image_size) - if isinstance(data, Image.Image): - data = image_pixel_values_mapper(data) - # Add an N dimension for number of images per prompt (currently 1). 
- data = data.unsqueeze(0) - elif is_list_of(data, Image.Image): - # we can't stack here because images may have different num_patches - data = [image_pixel_values_mapper(img) for img in data] - else: - return MultiModalKwargs({"image_embeds": data}) - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_token_id = tokenizer.encode(self.img_context_token, - add_special_tokens=False, - return_tensors="pt")[0] - - return MultiModalKwargs({ - "pixel_values": data, - "image_token_id": image_token_id - }) - - def dummy_data( + ) -> BaseInternVLProcessor: + raise NotImplementedError + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( self, - ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} + + def get_num_image_tokens( + self, *, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - ): - num_images = mm_counts["image"] + image_width: int, + image_height: int, + processor: Optional[BaseInternVLProcessor], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ) - hf_config = ctx.get_hf_config() + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - image_feature_size = get_max_internvl_image_tokens( - ctx, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, ) - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - seq_data, ranges = dummy_seq_data_for_clip( - 
hf_config.vision_config, - seq_len, - num_images, - image_token_id=tokenizer.encode(self.img_context_token, - add_special_tokens=False)[0], - image_feature_size_override=image_feature_size, + + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + + base_size = processor.image_size + target_ratios = processor.resolve_target_ratios() + + largest_feature_size, largest_feature_pinpoint = 0, None + for wr, hr in target_ratios: + width, height = base_size * wr, base_size * hr + + feat_size = self.get_num_image_tokens( + image_width=width, + image_height=height, + processor=processor, + ) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + + +_I = TypeVar("_I", bound=BaseInternVLProcessingInfo) + + +class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="<image>" * num_images, + mm_data=mm_data, ) - max_image_width, max_image_height = get_max_internvl_image_size( - ctx, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + +class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, ) - mm_data = 
dummy_image_for_clip( - hf_config.vision_config, - num_images, - image_width_override=max_image_width, - image_height_override=max_image_height, + image_token_id = self.info.get_hf_processor(**mm_kwargs).image_token_id + image_data = mm_data.get("images", []) + assert isinstance(image_data, list) + + # Since there may be extra tokens in the feature placeholders, + # we need to pass the image token ID to the model to select the + # tokens to merge from the vision encoder outputs + processed_outputs["image_token_id"] = [image_token_id + ] * len(image_data) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + + return dict( + pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_patches), + image_num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.batched("image"), ) - return DummyData(seq_data, mm_data, ranges) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # TODO: Use image size information in dictionary embedding inputs + # to compute num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + else: + image_num_patches = [] + + def get_replacement_internvl(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, 
ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + num_patches = image_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return PromptReplacementDetails( + full=hf_processor.get_image_repl_full(feature_size, + num_patches), + features=hf_processor.get_image_repl_features( + feature_size, num_patches), + ) + return [ + PromptReplacement( + modality="image", + target="<image>", + replacement=get_replacement_internvl, + ) + ] -input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) +class InternVLProcessingInfo(BaseInternVLProcessingInfo): -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) -@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) + def get_hf_processor( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> InternVLProcessor: + return InternVLProcessor( + self.get_hf_config(), + self.get_tokenizer(), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + +@MULTIMODAL_REGISTRY.register_processor( + InternVLMultiModalProcessor, + info=InternVLProcessingInfo, + dummy_inputs=InternVLDummyInputsBuilder) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -621,11 +805,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[InternVLImageInputs]: - pixel_values = kwargs.pop("pixel_values", 
None) - image_token_id = kwargs.pop("image_token_id", None) + pixel_values_flat = kwargs.pop("pixel_values_flat", None) + image_num_patches = kwargs.pop("image_num_patches", None) image_embeds = kwargs.pop("image_embeds", None) - if pixel_values is None and image_embeds is None: + if pixel_values_flat is None and image_embeds is None: return None if image_embeds is not None: @@ -638,31 +822,30 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): data=flatten_bn(image_embeds), ) - self.img_context_token_id = image_token_id[0] + image_token_id = kwargs["image_token_id"] + assert isinstance(image_token_id, torch.Tensor) + self.img_context_token_id = image_token_id.flatten().unique().item() - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): + if pixel_values_flat is not None: + if not isinstance(pixel_values_flat, (torch.Tensor, list)): raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - patches_per_image = [] - for request_pixel_values in pixel_values: - for image_pixel_values in request_pixel_values: - patches_per_image.append(image_pixel_values.shape[0]) - # We need to flatten (B, N, P) to (B*N*P), - # so we call flatten_bn twice. 
+ f"Got type: {type(pixel_values_flat)}") + + assert isinstance(image_num_patches, (torch.Tensor, list)) + return InternVLImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( - flatten_bn(flatten_bn(pixel_values), concat=True)), - patches_per_image=patches_per_image) + flatten_bn(pixel_values_flat, concat=True)), + patches_per_image=flatten_bn(image_num_patches, + concat=True).tolist()) raise AssertionError("This line should be unreachable.") def _process_image_input( self, image_input: InternVLImageInputs, - ) -> Tuple[torch.Tensor]: + ) -> tuple[torch.Tensor, ...]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -689,7 +872,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): image_embeds = image_embeds.split(image_feature_sizes) return image_embeds - def _set_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: if self.is_mono: self.visual_token_mask = ( input_ids == self.img_context_token_id).reshape(-1, 1) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 19effcbfc..63d308ef6 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -125,7 +125,11 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_max_image_tokens()} def _apply_feature_select_strategy( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index d70ae2f14..817edcef4 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -62,7 +62,11 @@ class 
LlavaNextVideoProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"video": 1} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: target_width, target_height = self.get_image_size_with_most_features() max_video_tokens = self.get_num_video_tokens( diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index f1c06cd85..288942628 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -103,7 +103,11 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), "video": self.get_max_video_tokens(seq_len), diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index f1c168076..ab697fb8c 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -23,7 +23,6 @@ # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" from functools import partial -from itertools import accumulate from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -138,11 +137,15 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None, "audio": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), "audio": self.get_max_audio_tokens(), - "video": self.get_max_video_tokens(seq_len) + "video": self.get_max_video_tokens(seq_len), } def get_default_audio_pool_step(self) -> int: @@ -369,23 +372,18 @@ class MiniCPMOMultiModalProcessor( hf_inputs, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: + audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0)) - def get_slices(num_slices: List[int]) -> List[int]: - slice_indices = [0] + list(accumulate(num_slices)) - slices = [(slice_indices[i], slice_indices[i + 1]) - for i in range(len(num_slices))] - return [slice(*slice_item) for slice_item in slices] - - audio_slices = get_slices( - hf_inputs.get("audio_num_slices", torch.empty(0))) return dict( **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs), - audio_features=MultiModalFieldConfig.flat("audio", audio_slices), - audio_feature_lens=MultiModalFieldConfig.flat( - "audio", audio_slices), + audio_features=MultiModalFieldConfig.flat_from_sizes( + "audio", audio_num_slices), + audio_feature_lens=MultiModalFieldConfig.flat_from_sizes( + "audio", audio_num_slices), audio_num_slices=MultiModalFieldConfig.batched("audio"), audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"), - audio_embeds=MultiModalFieldConfig.flat("audio", 
audio_slices)) + audio_embeds=MultiModalFieldConfig.flat_from_sizes( + "audio", audio_num_slices)) class MultiModalProjector(nn.Module): diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 6964d6bdc..3d16d635b 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -26,7 +26,6 @@ import math import re from collections import Counter from functools import cached_property, partial -from itertools import accumulate from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -365,7 +364,11 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): else: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: mm_max_tokens = {"image": self.get_max_image_tokens()} if self.get_model_version() == (2, 6): mm_max_tokens["video"] = self.get_max_video_tokens(seq_len) @@ -761,30 +764,25 @@ class MiniCPMVMultiModalProcessor( hf_inputs, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - - def get_slices(num_slices: List[int]) -> List[int]: - slice_indices = [0] + list(accumulate(num_slices)) - slices = [(slice_indices[i], slice_indices[i + 1]) - for i in range(len(num_slices))] - return [slice(*slice_item) for slice_item in slices] - - image_slices = get_slices( - hf_inputs.get("image_num_slices", torch.empty(0))) - video_slices = get_slices( - hf_inputs.get("video_num_slices", torch.empty(0))) - - return dict( - pixel_values=MultiModalFieldConfig.flat("image", image_slices), - image_sizes=MultiModalFieldConfig.batched("image"), - tgt_sizes=MultiModalFieldConfig.flat("image", image_slices), - image_num_slices=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.flat("image", image_slices), - 
video_pixel_values=MultiModalFieldConfig.flat( - "video", video_slices), - video_image_sizes=MultiModalFieldConfig.batched("video"), - video_tgt_sizes=MultiModalFieldConfig.flat("video", video_slices), - video_embeds=MultiModalFieldConfig.flat("video", video_slices), - video_num_slices=MultiModalFieldConfig.batched("video")) + image_num_slices = hf_inputs.get("image_num_slices", torch.empty(0)) + video_num_slices = hf_inputs.get("video_num_slices", torch.empty(0)) + + return dict(pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_slices), + image_sizes=MultiModalFieldConfig.batched("image"), + tgt_sizes=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_slices), + image_num_slices=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_slices), + video_pixel_values=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_slices), + video_image_sizes=MultiModalFieldConfig.batched("video"), + video_tgt_sizes=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_slices), + video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_slices), + video_num_slices=MultiModalFieldConfig.batched("video")) def apply( self, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 2aa04bd71..9c674ab46 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -6,44 +6,190 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from typing import Optional +from typing import Mapping, Optional +import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.inputs import INPUT_REGISTRY from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs +from 
vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + MultiModalDataItems) +from vllm.multimodal.processing import (PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import ProcessorInputs from .intern_vit import InternVisionModel -from .internvl import (InternVLChatModel, InternVLInputPipeline, - get_max_internvl_image_tokens) +from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor, + InternVLChatModel, InternVLDummyInputsBuilder, + InternVLMultiModalProcessor) -IMG_START = '<|vision_start|>' -IMG_END = '<|vision_end|>' -IMG_CONTEXT = '<|vision_pad|>' +IMG_PAD = "<|vision_pad|>" -class NVLMInputPipeline(InternVLInputPipeline): +class NVLMProcessor(BaseInternVLProcessor): + + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_PAD] + + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + if num_patches is None: + raise NotImplementedError("Embedding inputs are not supported") + + tile_pos_identifiers = [f"" for i in range(1, num_patches)] + if self.use_thumbnail and num_patches != 1: + tile_pos_identifiers += [""] - def _create_image_prompt(self, feature_size: int, num_patches: int) -> str: - tile_pos_identifiers = ([f"" - for i in range(1, num_patches)] + - [""]) context_size = feature_size // num_patches + features = "".join(identifier + IMG_PAD * context_size + for identifier in tile_pos_identifiers) + + # We include the start and end as well because "<", "tile"], resulting in assertion error + # when trying to find "" + features + "" + + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + return self.get_image_repl_features(feature_size, num_patches) + + +class NVLMProcessingInfo(BaseInternVLProcessingInfo): + + def get_hf_processor( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> NVLMProcessor: + return 
NVLMProcessor( + self.get_hf_config(), + self.get_tokenizer(), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + def get_max_image_tokens(self) -> int: + hf_processor = self.get_hf_processor() + tokenizer = hf_processor.tokenizer + + max_num_patches = hf_processor.max_dynamic_patch + # we need +1 here because max_dynamic_patch in config doesn't + # include the thumbnail patch + tile_pos_identifiers = [ + f"" for i in range(max_num_patches) + ] + if hf_processor.use_thumbnail and max_num_patches != 1: + tile_pos_identifiers += [""] + + # "<", "tile"] + # so we include in the start_str + start_str = "" + tile_pos_identifiers.pop(0) + end_str = "" + start_token_len = len(tokenizer.encode(start_str)) + end_token_len = len(tokenizer.encode(end_str)) + tile_token_len = sum( + len(tokenizer.encode(identifier)) + for identifier in tile_pos_identifiers) + non_image_tokens_num = start_token_len + end_token_len + tile_token_len + return super().get_max_image_tokens() + non_image_tokens_num + + +class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + # The newline is necessary to separate ">" of the current item + # and "<" of the next item + prompt_text="\n" * num_images, + mm_data=mm_data, + ) + + +class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]): - return '' + ''.join( - tile_pos_identifier + self.img_context_token * context_size - for tile_pos_identifier in tile_pos_identifiers) + '' + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, 
object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # TODO: Use image size information in dictionary embedding inputs + # to compute num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + else: + image_num_patches = [] + + def get_replacement_nvlm(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + num_patches = image_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return PromptReplacementDetails( + full=hf_processor.get_image_repl_full(feature_size, + num_patches) + "\n", + features=hf_processor.get_image_repl_features( + feature_size, num_patches) + "\n", + ) -input_pipeline = NVLMInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) + # See note in dummy data regarding why we have the extra newline + return [ + PromptReplacement( + modality="image", + target="\n", + replacement=get_replacement_nvlm, + ) + ] -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) -@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) +@MULTIMODAL_REGISTRY.register_processor(NVLMMultiModalProcessor, + info=NVLMProcessingInfo, + 
dummy_inputs=NVLMDummyInputsBuilder) class NVLM_D_Model(InternVLChatModel): def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index f089fa5d2..053390c52 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -322,7 +322,11 @@ class Phi3VProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: target_width, target_height = self.get_image_size_with_most_features() max_image_tokens = self.get_num_image_tokens( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d7f6662bc..327fad0f5 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -779,7 +779,11 @@ class QWenVLProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: @@ -799,13 +803,13 @@ class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): vision_config = hf_config.visual - max_image_size = vision_config["image_size"] + target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) mm_data = { "image": - self._get_dummy_images(width=max_image_size, - height=max_image_size, + self._get_dummy_images(width=target_width, + height=target_height, num_images=num_images) } diff --git a/vllm/model_executor/models/qwen2_audio.py 
b/vllm/model_executor/models/qwen2_audio.py index cf104ab00..f09529ca4 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -110,7 +110,11 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: hf_config = self.get_hf_config() max_source_positions = hf_config.audio_config.max_source_positions max_output_lengths = (max_source_positions - 2) // 2 + 1 diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 189ac41e8..2b2638cf6 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -758,7 +758,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), "video": self.get_max_video_tokens(seq_len), @@ -989,26 +993,21 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) - image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist() - image_slices = [ - slice(image_slice_idxs[i], image_slice_idxs[i + 1]) - for i in range(len(image_grid_thw)) - ] + image_grid_sizes = image_grid_thw.prod(-1) video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) - video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist() - 
video_slices = [ - slice(video_slice_idxs[i], video_slice_idxs[i + 1]) - for i in range(len(video_grid_thw)) - ] + video_grid_sizes = video_grid_thw.prod(-1) return dict( - pixel_values=MultiModalFieldConfig.flat("image", image_slices), - image_embeds=MultiModalFieldConfig.flat("image", image_slices), + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), + image_embeds=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), image_grid_thw=MultiModalFieldConfig.batched("image"), - pixel_values_videos=MultiModalFieldConfig.flat( - "video", video_slices), - video_embeds=MultiModalFieldConfig.flat("video", video_slices), + pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), + video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), video_grid_thw=MultiModalFieldConfig.batched("video"), ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 5e86b15db..52a4d798f 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -92,7 +92,11 @@ class UltravoxProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: feature_extractor = self.get_feature_extractor() max_audio_tokens = math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index eb52551bb..fe24c7282 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass +from itertools import accumulate from typing import 
(TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, Union, cast, final) @@ -258,6 +259,16 @@ class MultiModalFieldConfig: slices=slices, ) + @staticmethod + def flat_from_sizes(modality: str, size_per_item: torch.Tensor): + slice_idxs = [0, *accumulate(size_per_item)] + slices = [ + slice(slice_idxs[i], slice_idxs[i + 1]) + for i in range(len(size_per_item)) + ] + + return MultiModalFieldConfig.flat(modality, slices) + def __init__( self, field_cls: type[BaseMultiModalField], diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 2ad42d1c1..d704fa59b 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -680,7 +680,11 @@ class BaseProcessingInfo: raise NotImplementedError @abstractmethod - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: """ Get the maximum possible number of tokens per data item for each modality. 
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 953c01000..5dd754854 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -151,7 +151,8 @@ class MultiModalProfiler(Generic[_I]): mm_counts = self.get_mm_limits() info = self.processing_info - mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) + mm_max_tokens_per_item = info.get_mm_max_tokens_per_item( + seq_len, mm_counts) if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 29036691b..041411142 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -264,7 +264,9 @@ class MultiModalRegistry: ) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len - return processor.info.get_mm_max_tokens_per_item(seq_len) + mm_limits = self.get_mm_limits_per_prompt(model_config) + return processor.info.get_mm_max_tokens_per_item( + seq_len, mm_limits) return { key: plugin.get_max_multimodal_tokens(model_config) -- GitLab From 18a88fcccce73261d51a18aba17368236ceb2f8b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Feb 2025 02:43:58 -0800 Subject: [PATCH 31/65] [V1] Remove scheduling constraint on partial requests (#12674) Signed-off-by: Woosuk Kwon --- tests/v1/core/test_scheduler.py | 214 +++++++++++++++++++++++++++++ vllm/v1/core/scheduler.py | 129 ++++++++--------- vllm/v1/worker/block_table.py | 2 + vllm/v1/worker/gpu_model_runner.py | 128 ++++++++++------- 4 files changed, 350 insertions(+), 123 deletions(-) create mode 100644 tests/v1/core/test_scheduler.py diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py new file mode 100644 index 000000000..8eb08f3e8 --- /dev/null +++ b/tests/v1/core/test_scheduler.py @@ -0,0 +1,214 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import List, Optional + +from vllm.config import CacheConfig, ModelConfig, 
SchedulerConfig +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams +from vllm.v1.core.scheduler import Scheduler +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus + + +def create_scheduler( + model: str = "facebook/opt-125m", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 8192, +) -> Scheduler: + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_num_batched_tokens, + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + cache_config = CacheConfig( + block_size=16, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + ) + cache_config.num_gpu_blocks = 10000 + return Scheduler(scheduler_config, + model_config, + cache_config, + lora_config=None) + + +def create_requests( + num_requests: int, + num_tokens: int = 10, + mm_positions: Optional[List[PlaceholderRange]] = None, +): + sampling_params = SamplingParams() + requests = [] + for i in range(num_requests): + if mm_positions is not None: + mm_position = mm_positions[i] + mm_inputs = [MultiModalKwargs({})] * len(mm_position) + else: + mm_position = None + mm_inputs = None + request = Request( + request_id=f"{i}", + prompt=None, + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=None, + arrival_time=0, + ) + requests.append(request) + return requests + + +def test_add_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + + for i, request in enumerate(requests): + scheduler.add_request(request) + assert request.request_id in scheduler.requests + assert len(scheduler.waiting) == i + 1 + + +def 
test_finish_request(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_ABORTED) + assert request.request_id not in scheduler.requests + assert len(scheduler.waiting) == 9 - i + + +def test_get_num_unfinished_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_STOPPED) + assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1 + + +def test_schedule(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + # Verify all requests are scheduled. 
+ for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + + # Verify requests moved from waiting to running + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == len(requests) + for i, request in enumerate(requests): + assert scheduler.running[i] == request + + +def test_schedule_multimodal_requests(): + scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf") + mm_positions = [[PlaceholderRange(offset=i, length=100)] + for i in range(10)] + requests = create_requests( + num_requests=10, + num_tokens=200, + mm_positions=mm_positions, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + assert len(output.scheduled_encoder_inputs) == 10 + for req_id, encoder_input in output.scheduled_encoder_inputs.items(): + assert len(encoder_input) == 1 + + +def test_schedule_partial_requests(): + """Test scheduling behavior with partial requests. + + This test verifies that: + 1. The scheduler can handle multiple partial requests in a single step when + constrained by encoder budget. + 2. A request in RUNNING state may be unscheduled in subsequent steps if + there is insufficient encoder budget. 
+ """ + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + max_num_batched_tokens=1024, + ) + mm_positions = [[PlaceholderRange(offset=100, length=600)] + for _ in range(3)] + requests = create_requests( + num_requests=3, + num_tokens=800, + mm_positions=mm_positions, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 3 + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + + assert scheduler.max_num_encoder_input_tokens == 1024 + # The first request is scheduled fully. + assert output.num_scheduled_tokens[requests[0].request_id] == 800 + # The second request is scheduled partially. + # The tokens are not scheduled because of the encoder budget. + assert output.num_scheduled_tokens[requests[1].request_id] == 100 + # The third request is also scheduled partially. + # The tokens are not scheduled because of the encoder budget. + assert output.num_scheduled_tokens[requests[2].request_id] == 100 + req_to_index = { + request.request_id: i + for i, request in enumerate(requests) + } + model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[0] * len(requests), + logprob_token_ids_cpu=None, + logprobs_cpu=None, + ) + scheduler.update_from_output(output, model_runner_output) + + # Schedule the next step. + # Only the first and second requests are scheduled. + # The third request is in the RUNNING state but not scheduled in this step + # because of the encoder budget. 
+ output = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output.scheduled_new_reqs) == 0 + assert len(output.scheduled_cached_reqs) == 2 + assert len(output.finished_req_ids) == 0 + assert output.num_scheduled_tokens[requests[0].request_id] == 1 + assert output.num_scheduled_tokens[requests[1].request_id] == 700 + assert requests[2].request_id not in output.num_scheduled_tokens diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index f4738bb33..fb5e83fe0 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -67,10 +67,10 @@ class Scheduler: # This is flushed at the end of each scheduling step. self.finished_req_ids: Set[str] = set() - # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating + # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. - # Request id -> RunningRequestData - self.running_reqs_data: Dict[str, RunningRequestData] = {} + # Request id -> CachedRequestData + self._cached_reqs_data: Dict[str, CachedRequestData] = {} # Encoder-related. # Calculate encoder cache size if applicable @@ -115,17 +115,8 @@ class Scheduler: encoder_budget = self.max_num_encoder_input_tokens # First, schedule the RUNNING requests. - # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be - # in the "partial" state, where the request has some tokens computed - # but not all. The constraint is due to the persistent batch in the - # V1 model runner. - # TODO(woosuk): Remove this constraint after refactoring model runner. - has_partial_request = False req_index = 0 - while req_index < len(self.running): - # Only the last request in the RUNNING queue can be "partial". 
- assert not has_partial_request - assert token_budget > 0 + while req_index < len(self.running) and token_budget > 0: request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens num_new_tokens = min(num_new_tokens, token_budget) @@ -137,7 +128,14 @@ class Scheduler: request.num_computed_tokens, num_new_tokens, encoder_budget)) - assert num_new_tokens > 0 + if num_new_tokens == 0: + # The request cannot be scheduled because the encoder budget + # or the encoder cache is exhausted. + # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue while True: new_blocks = self.kv_cache_manager.allocate_slots( @@ -172,8 +170,6 @@ class Scheduler: num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 - has_partial_request = (request.num_computed_tokens + num_new_tokens - < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -186,13 +182,9 @@ class Scheduler: # Next, schedule the WAITING requests. if not preempted_reqs: - while self.waiting: - if has_partial_request: - break + while self.waiting and token_budget > 0: if len(self.running) == self.max_num_running_reqs: break - if token_budget == 0: - break request = self.waiting[0] # Get already-cached tokens. @@ -249,8 +241,6 @@ class Scheduler: token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens - < request.num_tokens) # Encoder-related. 
if encoder_inputs_to_schedule: @@ -266,8 +256,11 @@ class Scheduler: assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens assert token_budget >= 0 assert len(self.running) <= self.max_num_running_reqs + # Since some requests in the RUNNING queue may not be scheduled in + # this step, the total number of scheduled requests can be smaller than + # len(self.running). assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + - len(scheduled_running_reqs) == len(self.running)) + len(scheduled_running_reqs) <= len(self.running)) # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. @@ -286,25 +279,28 @@ class Scheduler: for req in scheduled_new_reqs ] resumed_reqs_data = [ - ResumedRequestData.from_request( - req, req_to_new_block_ids[req.request_id], - req.num_computed_tokens) for req in scheduled_resumed_reqs + self._make_cached_request_data( + req, + req_to_new_block_ids[req.request_id], + req.num_computed_tokens, + resumed_from_preemption=True, + ) for req in scheduled_resumed_reqs ] running_reqs_data = [ - self._make_running_request_data( - req, req_to_new_block_ids[req.request_id], - req.num_computed_tokens) for req in scheduled_running_reqs + self._make_cached_request_data( + req, + req_to_new_block_ids[req.request_id], + req.num_computed_tokens, + resumed_from_preemption=False, + ) for req in scheduled_running_reqs ] - preempted_req_ids = {req.request_id for req in preempted_reqs} scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, - scheduled_resumed_reqs=resumed_reqs_data, - scheduled_running_reqs=running_reqs_data, + scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, num_common_prefix_blocks=num_common_prefix_blocks, - preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state 
in the scheduler, # instead of being newly scheduled in this step. # It contains the request IDs that are finished in between @@ -316,22 +312,26 @@ class Scheduler: self.finished_req_ids = set() return scheduler_output - def _make_running_request_data( + def _make_cached_request_data( self, request: Request, new_block_ids: List[int], num_computed_tokens: int, - ) -> "RunningRequestData": - # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating + resumed_from_preemption: bool, + ) -> "CachedRequestData": + # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. - if request.request_id in self.running_reqs_data: - req_data = self.running_reqs_data[request.request_id] + if request.request_id in self._cached_reqs_data: + req_data = self._cached_reqs_data[request.request_id] + req_data.resumed_from_preemption = resumed_from_preemption req_data.new_block_ids = new_block_ids req_data.num_computed_tokens = num_computed_tokens else: - req_data = RunningRequestData.from_request(request, new_block_ids, - num_computed_tokens) - self.running_reqs_data[request.request_id] = req_data + req_data = CachedRequestData.from_request(request, + resumed_from_preemption, + new_block_ids, + num_computed_tokens) + self._cached_reqs_data[request.request_id] = req_data return req_data def _try_schedule_encoder_inputs( @@ -420,7 +420,13 @@ class Scheduler: # expensive operations inside the loop. for request in self.running: req_id = request.request_id - request.num_computed_tokens += num_scheduled_tokens[req_id] + num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0) + if num_tokens_scheduled == 0: + # The request was not scheduled in this step. + new_running.append(request) + continue + + request.num_computed_tokens += num_tokens_scheduled # When the request's num_computed_tokens catches up its num_tokens, # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. 
@@ -529,7 +535,7 @@ class Scheduler: assert request.is_finished() self.kv_cache_manager.free(request) self.encoder_cache_manager.free(request) - self.running_reqs_data.pop(request.request_id, None) + self._cached_reqs_data.pop(request.request_id, None) del self.requests[request.request_id] self.finished_req_ids.add(request.request_id) @@ -584,30 +590,13 @@ class NewRequestData: @dataclass -class ResumedRequestData: - - req_id: str - block_ids: List[int] - num_computed_tokens: int - - @classmethod - def from_request( - cls, - request: Request, - block_ids: List[int], - num_computed_tokens: int, - ) -> "ResumedRequestData": - return cls( - req_id=request.request_id, - block_ids=block_ids, - num_computed_tokens=num_computed_tokens, - ) - - -@dataclass -class RunningRequestData: +class CachedRequestData: req_id: str + # If resumed_from_preemption is False, new_block_ids will be appended to + # the request's block IDs. If True, new_block_ids will be used as the + # request's block IDs instead of appending to the existing block IDs. 
+ resumed_from_preemption: bool new_block_ids: List[int] num_computed_tokens: int @@ -615,11 +604,13 @@ class RunningRequestData: def from_request( cls, request: Request, + resumed_from_preemption: bool, new_block_ids: List[int], num_computed_tokens: int, - ) -> "RunningRequestData": + ) -> "CachedRequestData": return cls( req_id=request.request_id, + resumed_from_preemption=resumed_from_preemption, new_block_ids=new_block_ids, num_computed_tokens=num_computed_tokens, ) @@ -629,14 +620,12 @@ class RunningRequestData: class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] - scheduled_resumed_reqs: List[ResumedRequestData] - scheduled_running_reqs: List[RunningRequestData] + scheduled_cached_reqs: List[CachedRequestData] num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int scheduled_encoder_inputs: Dict[str, List[int]] num_common_prefix_blocks: int - preempted_req_ids: Set[str] finished_req_ids: Set[str] free_encoder_input_ids: List[Tuple[str, int]] diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 8d0785243..f520ee958 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -46,6 +46,8 @@ class BlockTable: start: int, block_ids: List[int], ) -> None: + if not block_ids: + return num_blocks = len(block_ids) self.block_table_np[row_idx, start:start + num_blocks] = block_ids self.num_blocks_per_row[row_idx] = start + num_blocks diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0b5644525..7841fac1d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -205,12 +205,32 @@ class GPUModelRunner: pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() - def _update_states(self, scheduler_output: "SchedulerOutput") -> None: - # Remove stopped requests from the cached states. - # Keep the states of the preempted requests. 
+ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: + """Update the cached states and the persistent batch with the scheduler + output. + + The updated states are used by the `_prepare_inputs` function to create + the input GPU tensors for the model. + + Returns: + True if there is a new/resumed/paused/finished request in the batch. + If False, we can skip copying SamplingMetadata to the GPU. + """ + # Remove finished requests from the cached states. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) + # Remove the finished requests from the persistent batch. + # NOTE(woosuk): There could be an edge case where finished_req_ids and + # scheduled_req_ids overlap. This happens when a request is aborted and + # then resubmitted with the same ID. In this case, we treat them as two + # distinct requests - clearing the cached states for the first request + # and handling the second as a new request. + removed_req_indices: List[int] = [] + for req_id in scheduler_output.finished_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) # Free the cached encoder outputs. for req_id, input_id in scheduler_output.free_encoder_input_ids: @@ -220,36 +240,22 @@ class GPUModelRunner: if not encoder_outputs: self.encoder_cache.pop(req_id, None) - # Remove the requests from the persistent batch. - stopped_req_ids = set().union( - scheduler_output.preempted_req_ids, - scheduler_output.finished_req_ids, - ) - removed_req_indices: List[int] = [] - for req_id in stopped_req_ids: + # Remove the unscheduled requests from the persistent batch. + # NOTE(woosuk): The unscheduled requests are either preempted requests + # or running requests that are not scheduled in this step. We remove + # them from the persistent batch but keep their cached states since + # they will be scheduled again sometime in the future. 
+ scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() + cached_req_ids = self.input_batch.req_id_to_index.keys() + unscheduled_req_ids = cached_req_ids - scheduled_req_ids + # NOTE(woosuk): The persistent batch optimization assumes that + # consecutive batches contain mostly the same requests. If batches + # have low request overlap (e.g., alternating between two distinct + # sets of requests), this optimization becomes very inefficient. + for req_id in unscheduled_req_ids: req_index = self.input_batch.remove_request(req_id) - if req_index is not None: - removed_req_indices.append(req_index) - - # Update the states of the running requests. - for req_data in scheduler_output.scheduled_running_reqs: - req_id = req_data.req_id - req_state = self.requests[req_id] - req_index = self.input_batch.req_id_to_index[req_id] - - # Update the num_computed_tokens. - req_state.num_computed_tokens = req_data.num_computed_tokens - self.input_batch.num_computed_tokens_cpu[req_index] = ( - req_data.num_computed_tokens) - - # Update the block table. - num_new_blocks = len(req_data.new_block_ids) - if num_new_blocks == 0: - continue - start_index = len(req_state.block_ids) - req_state.block_ids.extend(req_data.new_block_ids) - self.input_batch.block_table.append_row(req_index, start_index, - req_data.new_block_ids) + assert req_index is not None + removed_req_indices.append(req_index) req_ids_to_add: List[str] = [] # Add new requests to the cached states. @@ -305,14 +311,36 @@ class GPUModelRunner: req_ids_to_add.append(req_id) - # Update the cached states of the resumed requests. - for res_req_data in scheduler_output.scheduled_resumed_reqs: - req_id = res_req_data.req_id + # Update the states of the running/resumed requests. 
+ for req_data in scheduler_output.scheduled_cached_reqs: + req_id = req_data.req_id req_state = self.requests[req_id] - req_state.block_ids = res_req_data.block_ids - req_state.num_computed_tokens = res_req_data.num_computed_tokens - req_ids_to_add.append(req_id) + # Update the cached states. + req_state.num_computed_tokens = req_data.num_computed_tokens + if not req_data.resumed_from_preemption: + # Append the new blocks to the existing block IDs. + req_state.block_ids.extend(req_data.new_block_ids) + else: + # The request is resumed from preemption. + # Replace the existing block IDs with the new ones. + req_state.block_ids = req_data.new_block_ids + + req_index = self.input_batch.req_id_to_index.get(req_id) + if req_index is None: + # The request is not in the persistent batch. + # The request was either preempted and resumed later, or was not + # scheduled in the previous step and needs to be added again. + req_ids_to_add.append(req_id) + continue + + # Update the persistent batch. + self.input_batch.num_computed_tokens_cpu[req_index] = ( + req_data.num_computed_tokens) + start_index = len(req_state.block_ids) - len( + req_data.new_block_ids) + self.input_batch.block_table.append_row(req_index, start_index, + req_data.new_block_ids) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. @@ -330,6 +358,7 @@ class GPUModelRunner: # Condense the batched states if there are empty indices. if removed_req_indices: self.input_batch.condense(removed_req_indices) + return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0 def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -536,10 +565,10 @@ class GPUModelRunner: prefix_kv_lens=prefix_kv_lens, suffix_kv_lens=suffix_kv_lens, ) - # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial - # request in the batch. 
While we should not sample any token from this - # partial request, we do so for simplicity. We will ignore the sampled - # token from the partial request. + # NOTE(woosuk): Due to chunked prefills, the batch may contain partial + # requests. While we should not sample any token from these partial + # requests, we do so for simplicity. We will ignore the sampled + # tokens from the partial requests. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices @@ -601,22 +630,15 @@ class GPUModelRunner: def _prepare_sampling( self, - scheduler_output: "SchedulerOutput", + batch_changed: bool, ) -> SamplingMetadata: - skip_copy = True - if (scheduler_output.finished_req_ids - or scheduler_output.preempted_req_ids): - skip_copy = False - if (scheduler_output.scheduled_new_reqs - or scheduler_output.scheduled_resumed_reqs): - skip_copy = False # Create the sampling metadata. req_id_output_token_ids: Dict[str, List[int]] = \ {req_id: req.output_token_ids \ for req_id, req in self.requests.items()} sampling_metadata = self.input_batch.make_sampling_metadata( - req_id_output_token_ids, skip_copy) + req_id_output_token_ids, skip_copy=not batch_changed) return sampling_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): @@ -715,7 +737,7 @@ class GPUModelRunner: self, scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: - self._update_states(scheduler_output) + batch_changed = self._update_states(scheduler_output) if self.is_multimodal_model: # Run the multimodal encoder if any. @@ -778,7 +800,7 @@ class GPUModelRunner: logits = self.model.compute_logits(hidden_states, None) # Sample the next token and get logprobs if needed. 
- sampling_metadata = self._prepare_sampling(scheduler_output) + sampling_metadata = self._prepare_sampling(batch_changed) sampler_output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, -- GitLab From 815079de8e9dd984d474f7046412d5aedf4350ff Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 4 Feb 2025 20:00:51 +0800 Subject: [PATCH 32/65] [VLM] merged multimodal processor and V1 support for idefics3 (#12660) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- docs/source/models/supported_models.md | 2 +- .../vision_language/test_models.py | 4 +- .../vision_language/vlm_utils/model_utils.py | 8 + .../multimodal/processing/test_common.py | 1 + .../multimodal/processing/test_idefics3.py | 179 ++---- vllm/inputs/registry.py | 18 + vllm/model_executor/models/idefics3.py | 560 ++++++++---------- 7 files changed, 315 insertions(+), 457 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index fbdca189a..d8e284292 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -733,7 +733,7 @@ See [this page](#generative-models) for more information on how to use generativ * `HuggingFaceM4/Idefics3-8B-Llama3` etc. 
* ✅︎ * - * + * ✅︎ - * `InternVLChatModel` * InternVL 2.5, Mono-InternVL, InternVL 2.0 * T + IE+ diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 7a14ba2f3..5fe46bd75 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -254,14 +254,14 @@ VLM_TEST_SETTINGS = { patch_hf_runner=model_utils.h2ovl_patch_hf_runner, ), "idefics3": VLMTestInfo( - models=["HuggingFaceM4/Idefics3-8B-Llama3"], + models=["HuggingFaceTB/SmolVLM-256M-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForVision2Seq, - marks=[large_gpu_mark(min_gb=48)], + hf_output_post_proc=model_utils.idefics3_trunc_hf_output, ), "intern_vl": VLMTestInfo( models=[ diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index d2401b222..ced891e1e 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -192,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, return output_ids, output_str, out_logprobs +def idefics3_trunc_hf_output(hf_output: RunnerOutput, + model: str) -> RunnerOutput: + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith(""): + output_str = output_str.split("")[0] + return output_ids, output_str, out_logprobs + + def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 07906a71d..5cd749cbd 100644 --- 
a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -149,6 +149,7 @@ def _test_processing_correctness( "adept/fuyu-8b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", + "HuggingFaceM4/Idefics3-8B-Llama3", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 00c1dae51..07ab1bbd4 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -1,13 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for Idefics3's multimodal preprocessing kwargs.""" -from typing import Optional - import pytest -import torch -from transformers import AutoImageProcessor, AutoTokenizer +from transformers import Idefics3Config -from vllm.inputs import InputContext, token_inputs -from vllm.multimodal import MultiModalRegistry +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from ....conftest import _ImageAssets from ...utils import build_model_context @@ -15,163 +12,53 @@ from ...utils import build_model_context models = ["HuggingFaceM4/Idefics3-8B-Llama3"] -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def input_processor_for_idefics3(): - from vllm.model_executor.models.idefics3 import ( - input_processor_for_idefics3) - return input_processor_for_idefics3 - - -@pytest.fixture() -def dummy_data_for_idefics3(): - from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3 - return dummy_data_for_idefics3 - - -@pytest.fixture() -def get_max_idefics3_image_tokens(): - from vllm.model_executor.models.idefics3 import ( - get_max_idefics3_image_tokens) - return get_max_idefics3_image_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge", 
[None, 168, 336, 400, 2 * 336]) -def test_input_mapper_override(model: str, image_assets: _ImageAssets, - longest_edge: Optional[int]): - """Ensure that the [default] input mapper handles size properly.""" - - mm_processor_kwargs = { - "size": { - "longest_edge": longest_edge - } - } if longest_edge is not None else {} - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - ) - - hf_processor = AutoImageProcessor.from_pretrained(model, - trust_remote_code=True, - **mm_processor_kwargs) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - image = image_assets[0].pil_image - hf_result = hf_processor.preprocess( - image, - return_tensors="pt", - ) - - vllm_result = mm_registry.map_input( - ctx.model_config, - {"image": image}, - ) - - assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge, expected_max_tokens", [ - (None, 2873), - (168, 169), - (336, 169), - (400, 338), - (672, 338), -]) -def test_max_tokens_override(get_max_idefics3_image_tokens, model: str, - longest_edge: Optional[int], - expected_max_tokens: int): - """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs.""" - size = {"longest_edge": longest_edge} if longest_edge is not None else None - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - actual_max_tokens = get_max_idefics3_image_tokens( - ctx=InputContext(ctx.model_config), - size=size, - ) - - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [ - (168, 169, 1), - (168, 169, 2), - (400, 338, 1), - (400, 338, 2), -]) -def test_dummy_data_override(dummy_data_for_idefics3, model: str, - longest_edge: int, 
toks_per_img: int, - num_imgs: int): - """Ensure dummy_data_for_idefics3 handles num_crops properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the dummy data func. - size = {"longest_edge": longest_edge} if longest_edge is not None else None - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - dummy_data = dummy_data_for_idefics3( - ctx=ctx, - seq_len=8192, # Should be bigger than num_imgs * toks_per_img - mm_counts={"image": num_imgs}, - size=size) - sequence_data = dummy_data.seq_data - # Ensure we have the right number of placeholders per size - image_token_id = ctx.get_hf_config().image_token_id - img_tok_count = sequence_data.get_token_ids().count(image_token_id) - assert img_tok_count == toks_per_img * num_imgs - - @pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [ - (336, 169 * (1**2 + 1), 1), - (336, 169 * (1**2 + 1), 2), - (400, 169 * (2**2 + 1), 1), - (400, 169 * (2**2 + 1), 2), -]) -def test_input_processor_override(input_processor_for_idefics3, - image_assets: _ImageAssets, model: str, - longest_edge: int, - expected_toks_per_img: int, num_imgs: int): +# yapf: disable +@pytest.mark.parametrize( + ("mm_processor_kwargs", "expected_toks_per_img"), + [ + ({"size": {"longest_edge": 364}}, 169), + ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)), + ]) +# yapf: enable +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override(image_assets: _ImageAssets, model: str, + mm_processor_kwargs: dict[str, object], + expected_toks_per_img: int, num_imgs: int): """Ensure input_processor_for_idefics3 handles num_crops properly.""" # Same as the previous test - don't initialize mm_processor_kwargs # in this test and assume that the kwargs will be correctly expanded by # the 
partial when calling the custom input processor. - size = {"longest_edge": longest_edge} if longest_edge is not None else None ctx = build_model_context( model_name=model, tokenizer_name=model, trust_remote_code=True, mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, ) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) + hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) # Build the image str / prompt based on the number of images we pass - tokenizer = AutoTokenizer.from_pretrained(model) placeholders = "" if num_imgs == 1 else "\n".join( f"Image-{i}: \n" for i in range(1, num_imgs + 1)) prompt = f"<|begin_of_text|>User:{placeholders}\n\nAssistant:" # noqa: E501 - images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs - - inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), - prompt=prompt, - multi_modal_data={"image": images}) - processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size) + # Build mm_data + image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size + dummy_image_size = (image_size * 4, image_size * 4) + dummy_image = image_assets[0].pil_image.resize(dummy_image_size) + mm_data = {"image": [dummy_image] * num_imgs} + + processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) + # Ensure the placeholders format are correct + hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"]) + assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[ + "input_ids"][0] # Ensure we have the right number of placeholders per num_crops size image_token_id = ctx.get_hf_config().image_token_id diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 0ec726b8b..cd4214439 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -31,6 +31,17 @@ C = TypeVar("C", bound=PretrainedConfig, 
default=PretrainedConfig) P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin) +class HashableDict(dict): + """ + A dictionary that can be hashed by lru_cache. + """ + + # NOTE: pythonic dict is not hashable, + # we override on it directly for simplicity + def __hash__(self) -> int: # type: ignore[override] + return hash(frozenset(self.items())) + + @dataclass(frozen=True) class InputContext: """ @@ -104,6 +115,13 @@ class InputContext: if isinstance(typ, type): merged_kwargs["processor_cls"] = typ + # NOTE: Pythonic dict is not hashable and will raise unhashable type + # error when calling `cached_get_processor`, therefore we need to + # wrap it to a hashable dict. + for key, value in merged_kwargs.items(): + if isinstance(value, dict): + merged_kwargs[key] = HashableDict(value) + hf_processor = cached_get_processor( self.model_config.model, trust_remote_code=self.model_config.trust_remote_code, diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 9e2e677a6..fdfabbaaf 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -16,35 +16,35 @@ """Inference-only Idefics3 model compatible with HuggingFace weights.""" import math -from typing import (Dict, Iterable, List, Literal, Mapping, NamedTuple, - Optional, Set, Tuple, TypedDict, Union) +from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import torch import torch.utils.checkpoint -from PIL import Image from torch import nn -# Temporary solution for transformers below 4.46.0. 
-from transformers import PretrainedConfig as Idefics3Config -from transformers import ProcessorMixin as Idefics3ImageProcessor +from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor, + Idefics3Processor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import NestedTensors -from vllm.sequence import IntermediateTensors, SequenceData -from vllm.transformers_utils.processor import cached_get_processor -from vllm.utils import is_list_of +from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, + MultiModalDataItems, + MultiModalFieldConfig, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors # yapf: disable from .idefics2_vision_model import ( @@ -77,307 +77,253 @@ class Idefics3ImageEmbeddingInputs(TypedDict): """ -class Idefics3ProcessorSize(NamedTuple): - """Hashable wrapper for unhashable `size` dict of Idefics3Processor.""" - # NOTE: cached_get_processor/cached_get_image_processor uses 
lru_cache, - # we need to use NamedTuple instead of TypedDict to avoid hashing issues. - longest_edge: int - - def __contains__(self, key: str) -> bool: - return key in self._asdict() and getattr(self, key) is not None - - def __getitem__(self, key: str) -> int: - return getattr(self, key) - - ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] -def get_mm_processor_kwargs(size: Optional[Dict[str, int]] = None) -> Dict: - mm_processor_kwargs = {} - if size: - mm_processor_kwargs["size"] = Idefics3ProcessorSize(**size) - return mm_processor_kwargs - - -def input_mapper_for_idefics3( - ctx: InputContext, - data: object, - *, - size: Optional[Dict[str, int]] = None, -): - model_config = ctx.model_config - mm_processor_kwargs = get_mm_processor_kwargs(size) - image_processor = cached_get_image_processor( - model_config.model, - trust_remote_code=model_config.trust_remote_code, - **mm_processor_kwargs) - if image_processor is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") - - if isinstance(data, Image.Image): - images = [[data]] - elif is_list_of(data, Image.Image): - images = [data] - else: - raise TypeError(f"Invalid image type: {type(data)}") - - try: - batch_data = image_processor(images, - return_tensors="pt", - return_row_col_info=True).data - except Exception: - logger.error("Failed to process image (%s)", data) - raise - - return MultiModalKwargs(batch_data) - - -def _resize_output_size(height: int, - width: int, - max_len: Optional[int] = None, - min_len: Optional[int] = 1, - max_size: Optional[int] = None) -> Tuple[int, int]: - # Set default value for max_len if not provided - max_len = max(height, width) if max_len is None else max_len - aspect_ratio = width / height - - # Handle the maximum size constraint - if max_size is not None: - max_len = min(max_len, max_size) - - # Adjust dimensions according to the aspect ratio - if width >= height: - width = max_len - height = int(width / 
aspect_ratio) - else: - height = max_len - width = int(height * aspect_ratio) - - # Ensure both width and height are even (if needed) - height += 1 if height % 2 != 0 else 0 - width += 1 if width % 2 != 0 else 0 - - # Ensure dimensions are not smaller than the minimum length - height = max(height, min_len) - width = max(width, min_len) - - return height, width - - -def _get_resize_output_image_size( - image_size: Tuple[int, int], - resolution_max_side: int, - max_image_size: int = 1820, -) -> Tuple[int, int]: - if resolution_max_side > max_image_size: - raise ValueError( - "`resolution_max_side` cannot be larger than `max_image_size`") - - height, width = image_size - - # Find the output size, when rescaling the longest edge to max_len and - # preserving the aspect ratio - height, width = _resize_output_size(height, - width, - max_len=resolution_max_side) - - return height, width - - -def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int, - fake_token_around_image: str, image_token: str, - global_img_token: str) -> str: - """ - Prompt with expanded image tokens for when the image is split - into patches. 
- """ - text_split_images = "" - for n_h in range(image_rows): - for n_w in range(image_cols): - text_split_images += (fake_token_around_image + - f"" + - image_token * image_seq_len) - text_split_images += "\n" - - text_split_images += "\n" + _prompt_single_image( - image_seq_len=image_seq_len, - fake_token_around_image=fake_token_around_image, - image_token=image_token, - global_img_token=global_img_token) - return text_split_images - - -def _prompt_single_image(image_seq_len: int, fake_token_around_image: str, - image_token: str, global_img_token: str): - """Prompt with expanded image tokens for a single image.""" - return (fake_token_around_image + global_img_token + - image_token * image_seq_len + fake_token_around_image) - - -def _get_image_prompt_string(image_rows: int, image_cols: int, - image_seq_len: int, fake_token_around_image: str, - image_token: str, global_img_token: str): - if image_rows == 0 and image_cols == 0: - return _prompt_single_image( - image_seq_len=image_seq_len, - fake_token_around_image=fake_token_around_image, - image_token=image_token, - global_img_token=global_img_token, - ) - return _prompt_split_image(image_seq_len, image_rows, image_cols, - fake_token_around_image, image_token, - global_img_token) - - -def input_processor_for_idefics3(ctx: InputContext, - inputs: DecoderOnlyInputs, - *, - size: Optional[Dict[str, int]] = None): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - mm_processor_kwargs = get_mm_processor_kwargs(size) - processor = cached_get_processor(model_config.model, **mm_processor_kwargs) - image_processor = processor.image_processor - tokenizer = processor.tokenizer - size = image_processor.size['longest_edge'] - max_image_size = image_processor.max_image_size['longest_edge'] - - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - image_list = [image_data] - elif 
is_list_of(image_data, Image.Image): - image_list = image_data - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - image_rows = [] - image_cols = [] - for image in image_list: - height, width = _get_resize_output_image_size(image.size, size) - - rows = math.ceil(height / max_image_size) - cols = math.ceil(width / max_image_size) - image_rows.append(rows) - image_cols.append(cols) - image_rows = [image_rows] - image_cols = [image_cols] - - n_images_in_text = [] - - text = inputs.get("prompt") - if text is None: - prompt_token_ids = inputs.get("prompt_token_ids", []) - assert prompt_token_ids - text = tokenizer.decode(prompt_token_ids) - - if isinstance(text, str): - text = [text] - elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, " - "or a list of strings") - - fake_image_token = processor.fake_image_token.content - image_token = processor.image_token.content - global_img_token = processor.global_image_tag - - prompt_strings = [] - for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): - n_images_in_text.append(sample.count(image_token)) - - # Replace the image token with fake tokens around the expanded - # image token sequence of length `image_seq_len` - image_prompt_strings = [] - for n_rows, n_cols in zip(sample_rows, sample_cols): - image_prompt_string = _get_image_prompt_string( - n_rows, - n_cols, - processor.image_seq_len, - image_token=image_token, - fake_token_around_image=fake_image_token, - global_img_token=global_img_token, - ) - image_prompt_strings.append(image_prompt_string) - - split_sample = sample.split(image_token) - if len(split_sample) == 0: - raise ValueError("The image token should be present in the text.") - - # Place in the image prompt strings where the image tokens are - sample = split_sample[0] - for i, image_prompt_string in enumerate(image_prompt_strings): - sample += image_prompt_string + split_sample[i + 1] - 
prompt_strings.append(sample) +class Idefics3ProcessingInfo(BaseProcessingInfo): - prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + def get_hf_processor( + self, + *, + size: Optional[Dict[str, int]] = None) -> Idefics3Processor: + if size is not None: + return self.ctx.get_hf_processor(Idefics3Processor, size=size) - return token_inputs( - prompt_token_ids=prompt_token_ids, - prompt=prompt_strings[0], - multi_modal_data=multi_modal_data, - ) + return self.ctx.get_hf_processor(Idefics3Processor) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} -def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int: - size = image_processor.size['longest_edge'] - max_image_size = image_processor.max_image_size['longest_edge'] - resized_height, resized_width = size, size + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + hf_processor = self.get_hf_processor() + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + grid_w, grid_h = self._get_image_feature_grid_size( + image_width=image_processor.size['longest_edge'], + image_height=image_processor.size['longest_edge'], + ) + num_image_token = (grid_w * grid_h + 1) * hf_processor.image_seq_len + # Calculate Non-image-token length + # NOTE: and are special token for SmolVLM + # but not for Idefic3, so we need to tokenize them to get actual length. 
+ tokenizer = self.get_tokenizer() + tile_token_len = len(tokenizer.tokenize("")) + glob_token_len = len(tokenizer.tokenize(hf_processor.global_image_tag)) + # linebreak and always cost 1 token + fake_token_len = lb_len = 1 + non_image_token = (grid_w * grid_h) * ( + tile_token_len + fake_token_len) + glob_token_len + ( + grid_h + 1) * lb_len + fake_token_len + return {"image": num_image_token + non_image_token} + + def _resize_output_size(self, + *, + height: int, + width: int, + max_len: Optional[int] = None, + min_len: Optional[int] = 1, + max_size: Optional[int] = None) -> tuple[int, int]: + # Set default value for max_len if not provided + max_len = max(height, width) if max_len is None else max_len + aspect_ratio = width / height + + # Handle the maximum size constraint + if max_size is not None: + max_len = min(max_len, max_size) + + # Adjust dimensions according to the aspect ratio + if width >= height: + width = max_len + height = int(width / aspect_ratio) + else: + height = max_len + width = int(height * aspect_ratio) - grid_h = resized_height // max_image_size - grid_w = resized_width // max_image_size - return (grid_h * grid_w + 1) + # Ensure both width and height are even (if needed) + height += height % 2 + width += width % 2 + # Ensure dimensions are not smaller than the minimum length + height = max(height, min_len) + width = max(width, min_len) -def get_max_idefics3_image_tokens(ctx: InputContext, - *, - size: Optional[Dict[str, - int]] = None) -> int: - model_config = ctx.model_config - mm_processor_kwargs = get_mm_processor_kwargs(size) - processor = cached_get_processor(model_config.model, **mm_processor_kwargs) - image_seq_len = processor.image_seq_len - image_processor = processor.image_processor + return height, width - max_num_image_patches = _get_max_num_image_patch(image_processor) + def _get_resize_output_image_size( + self, + *, + image_width: int, + image_height: int, + resolution_max_side: int, + ) -> tuple[int, int]: + hf_processor = 
self.get_hf_processor() + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + max_image_size = image_processor.size['longest_edge'] + if resolution_max_side > max_image_size: + raise ValueError( + "`resolution_max_side` cannot be larger than `max_image_size`") + + height, width = image_height, image_width + + # Find the output size, when rescaling the longest edge to max_len and + # preserving the aspect ratio + height, width = self._resize_output_size(height=height, + width=width, + max_len=resolution_max_side) + return height, width + + def _get_image_feature_grid_size( + self, + *, + image_width: int, + image_height: int, + size: Optional[dict[str, object]] = None, + ) -> tuple[int, int]: + hf_processor = self.get_hf_processor(size=size) + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + max_image_size = image_processor.max_image_size['longest_edge'] + size = image_processor.size['longest_edge'] + assert size % max_image_size == 0, ( + "`longest_edge` in image_processor's `size` must be divisible by " + "`longest_edge` in `max_image_size`, this may be caused by " + "incorrect mm_kwargs override.") + + resized_height, resized_width = self._get_resize_output_image_size( + image_width=image_width, + image_height=image_height, + resolution_max_side=size, + ) + if resized_height > max_image_size or resized_width > max_image_size: + grid_h = math.ceil(resized_height / max_image_size) + grid_w = math.ceil(resized_width / max_image_size) + else: + grid_h = grid_w = 0 + return grid_w, grid_h - return max_num_image_patches * image_seq_len +class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo] + ): -def dummy_data_for_idefics3( - ctx: InputContext, + def get_dummy_processor_inputs( + self, seq_len: int, mm_counts: Mapping[str, int], - *, - size: Optional[Dict[str, int]] = None) -> DummyData: - hf_config = ctx.get_hf_config() - num_images = mm_counts["image"] + ) -> ProcessorInputs: + num_images = 
mm_counts.get("image", 0) + hf_processor = self.info.get_hf_processor() + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + longest_edge = image_processor.max_image_size['longest_edge'] + image_token: str = hf_processor.image_token.content + + mm_data = { + "image": + self._get_dummy_images(width=longest_edge, + height=longest_edge, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) - mm_processor_kwargs = get_mm_processor_kwargs(size) - processor = cached_get_processor(ctx.model_config.model, - **mm_processor_kwargs) - max_num_image_patches = _get_max_num_image_patch(processor.image_processor) - image_seq_len = processor.image_seq_len - max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images - if seq_len - max_llm_image_tokens < 0: - raise RuntimeError( - f"Idefics3 cannot process {num_images} images in a prompt, " - "please increase max_model_len or reduce image limit by " - "--limit-mm-per-prompt.") +class Idefics3MultimodalProcessor( + BaseMultiModalProcessor[Idefics3ProcessingInfo]): - seq_data = SequenceData.from_prompt_token_counts( - (hf_config.image_token_id, max_llm_image_tokens), - (0, seq_len - max_llm_image_tokens)) + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = super()._call_hf_processor( + prompt, mm_data, mm_kwargs) + image_grids = [ + self.info._get_image_feature_grid_size( + image_width=img.width, + image_height=img.height, + **mm_kwargs, + ) for img in mm_data["images"] + ] + image_patches = list(map(lambda x: math.prod(x) + 1, image_grids)) + for key in ("pixel_values", "pixel_attention_mask"): + data = processed_outputs.pop(key) + data = data.flatten(0, 1).split(image_patches) + processed_outputs[key] = data + else: + tokenizer = self.info.get_tokenizer() + processed_outputs = tokenizer(prompt, + 
add_special_tokens=True, + return_tensors="pt") + return processed_outputs - width = height = hf_config.vision_config.image_size - image = Image.new("RGB", (width, height), color=0) - mm_data = {"image": [image] if num_images == 1 else [image] * num_images} + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + pixel_attention_mask=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) - return DummyData(seq_data, mm_data) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + image_token = hf_processor.image_token.content + fake_image_token = hf_processor.fake_image_token.content + global_img_token = hf_processor.global_image_tag + image_seq_len = hf_processor.image_seq_len + grid_placeholder = "" + + p_img = image_token * image_seq_len + global_img_placeholder = fake_image_token + global_img_token + p_img + tile_img_placeholder = fake_image_token + grid_placeholder + p_img + + def get_replacement_idefics3(item_idx: int) -> str: + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + grid_w, grid_h = self.info._get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + **hf_processor_mm_kwargs, + ) + if grid_w == 0 and grid_h == 0: + image_placeholder = global_img_placeholder + else: + tiles_placeholder = list[str]() + for i in range(grid_h): + for j in range(grid_w): + placeholder_per_tile = tile_img_placeholder.format( + n_h=i + 1, n_w=j + 1) + tiles_placeholder.append(placeholder_per_tile) + # Add line break if it is the last tile in the row + if j == 
grid_w - 1: + tiles_placeholder.append("\n") + + image_placeholder = "".join( + [*tiles_placeholder, "\n", global_img_placeholder]) + return image_placeholder + fake_image_token + + return [ + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement_idefics3, + ) + ] class Idefics3SimpleMLP(nn.Module): @@ -453,7 +399,7 @@ class Idefics3Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config: Idefics3Config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config @@ -541,15 +487,13 @@ class Idefics3Model(nn.Module): self, pixel_values: torch.Tensor, pixel_attention_mask: Optional[torch.BoolTensor] = None, - ) -> torch.Tensor: + ) -> NestedTensors: # NOTE: we skip the step to select the vision feature layer since # this is already done inside the vision tower - batch_size, num_images, num_channels, height, width = pixel_values.shape + num_patches = [x.size(0) for x in pixel_values] pixel_values = pixel_values.to( dtype=self.vision_model.embeddings.patch_embedding.weight.dtype ) # fp16 compatibility - pixel_values = pixel_values.view(batch_size * num_images, - *pixel_values.shape[2:]) # Remove padding images - padding images are full 0. 
nb_values_per_image = pixel_values.shape[1:].numel() @@ -567,8 +511,6 @@ class Idefics3Model(nn.Module): ) else: # Remove padding images from the mask - pixel_attention_mask = pixel_attention_mask.view( - batch_size * num_images, *pixel_attention_mask.shape[2:]) pixel_attention_mask = pixel_attention_mask[ real_images_inds].contiguous() @@ -587,10 +529,10 @@ class Idefics3Model(nn.Module): patch_attention_mask=patch_attention_mask, ) - return image_hidden_states + return image_hidden_states.split(num_patches) def _process_image_pixels( - self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor: + self, inputs: Idefics3ImagePixelInputs) -> NestedTensors: assert self.vision_model is not None pixel_values = inputs["data"] @@ -605,7 +547,9 @@ class Idefics3Model(nn.Module): assert self.vision_model is not None image_features = self._process_image_pixels(image_input) - return self.connector(image_features) + num_patches = [x.size(0) for x in image_features] + image_features = torch.cat(image_features) + return self.connector(image_features).split(num_patches) def get_input_embeddings( self, @@ -634,10 +578,10 @@ class Idefics3Model(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_idefics3) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) -@INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) +@MULTIMODAL_REGISTRY.register_processor( + Idefics3MultimodalProcessor, + info=Idefics3ProcessingInfo, + dummy_inputs=Idefics3DummyInputsBuilder) class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA): packed_modules_mapping = { @@ -689,7 +633,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, if self.config.text_config.tie_word_embeddings: self.lm_head.weight = self.model.text_model.wte.weight self.logits_processor = LogitsProcessor(config.text_config.vocab_size) - 
self.sampler = Sampler() + self.sampler = get_sampler() def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self.model._parse_and_validate_image_input(**kwargs) -- GitLab From 6469038b149425e25b77c1ef93adf0e7712fd100 Mon Sep 17 00:00:00 2001 From: Michael Greenbaum <48786769+mgtk77@users.noreply.github.com> Date: Tue, 4 Feb 2025 14:58:48 +0200 Subject: [PATCH 33/65] [Bugfix] Fix loading of fine-tuned models based on Phi-3-Small (#12689) Signed-off-by: Michael Greenbaum Co-authored-by: Michael Greenbaum --- vllm/model_executor/models/phi3_small.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index a8b7e9b2a..873e9d377 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -476,6 +476,8 @@ class Phi3SmallForCausalLM(nn.Module, SupportsPP): continue if is_pp_missing_parameter(name, self): continue + if "lm_head.weight" in name and self.config.tie_word_embeddings: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) -- GitLab From 62467a834a94566e2d81a276817a20174b474151 Mon Sep 17 00:00:00 2001 From: Kero Liang Date: Tue, 4 Feb 2025 21:03:19 +0800 Subject: [PATCH 34/65] Avoid unnecessary multi-modal input data copy when len(batch) == 1 (#12722) Signed-off-by: imkero --- vllm/multimodal/inputs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index fe24c7282..8e4af7f88 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -212,6 +212,11 @@ class MultiModalBatchedField(BaseMultiModalField): def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + if len(batch) == 1: + # An optimization when `batch` contains only one tensor: + # - produce exactly same result as 
`torch.stack(batch)` + # - will achieve zero-copy if the tensor is contiguous + return batch[0].unsqueeze(0).contiguous() first_shape = batch[0].shape if all(elem.shape == first_shape for elem in batch): return torch.stack(batch) @@ -235,6 +240,11 @@ class MultiModalFlatField(BaseMultiModalField): def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + if len(batch) == 1: + # An optimization when `batch` contains only one tensor: + # - produce exactly same result as `torch.concat(batch)` + # - will achieve zero-copy if the tensor is contiguous + return batch[0].contiguous() first_shape = batch[0].shape if all(elem.shape[1:] == first_shape[1:] for elem in batch): return torch.concat(batch) @@ -407,6 +417,12 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return stacked tensors_ = cast(list[torch.Tensor], stacked) + if len(tensors_) == 1: + # An optimization when `tensors_` contains only one tensor: + # - produce exactly same result as `torch.stack(tensors_)` + # - will achieve zero-copy if the tensor is contiguous + return tensors_[0].unsqueeze(0).contiguous() + if any(t.shape != tensors_[0].shape for t in tensors_): # The tensors have incompatible shapes and can't be stacked. 
return tensors_ -- GitLab From 649550f27e4ad35e7e3352800438991fdaf150a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophie=20du=20Cou=C3=A9dic?= Date: Tue, 4 Feb 2025 14:19:12 +0100 Subject: [PATCH 35/65] [Build] update requirements of no-device for plugin usage (#12630) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sophie du Couédic --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 50265d46e..a4043c43a 100755 --- a/setup.py +++ b/setup.py @@ -556,7 +556,7 @@ def get_requirements() -> List[str]: return resolved_requirements if _no_device(): - requirements = _read_requirements("requirements-cpu.txt") + requirements = _read_requirements("requirements-common.txt") elif _is_cuda(): requirements = _read_requirements("requirements-cuda.txt") cuda_major, cuda_minor = torch.version.cuda.split(".") -- GitLab From 18016a5e627d2a4b69af599272a5aa8ce71b98c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 4 Feb 2025 23:54:23 +0800 Subject: [PATCH 36/65] [Bugfix] Fix CI failures for InternVL and Mantis models (#12728) Signed-off-by: DarkLight1337 --- .../vision_language/test_models.py | 17 +- tests/models/registry.py | 3 +- tests/multimodal/test_processing.py | 69 +++ tests/multimodal/test_processor_kwargs.py | 402 ------------------ 4 files changed, 79 insertions(+), 412 deletions(-) delete mode 100644 tests/multimodal/test_processor_kwargs.py diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 5fe46bd75..85bc4ac13 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -9,6 +9,7 @@ from pathlib import PosixPath from typing import Type import pytest +from packaging.version import Version from transformers import AutoModelForVision2Seq from transformers import __version__ as TRANSFORMERS_VERSION @@ 
-154,13 +155,7 @@ VLM_TEST_SETTINGS = { stop_str=["<|im_end|>"], image_size_factors=[(0.10, 0.15)], max_tokens=64, - marks=[ - pytest.mark.skipif( - TRANSFORMERS_VERSION < "4.48.0", - reason="HF model requires transformers>=4.48.0", - ), - large_gpu_mark(min_gb=64), - ], + marks=[large_gpu_mark(min_gb=64)], ), "blip2": VLMTestInfo( models=["Salesforce/blip2-opt-2.7b"], @@ -206,7 +201,7 @@ VLM_TEST_SETTINGS = { image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], marks=[ pytest.mark.skipif( - TRANSFORMERS_VERSION >= "4.48.0", + Version(TRANSFORMERS_VERSION) >= Version("4.48"), reason="HF model is not compatible with transformers>=4.48.0", ) ], @@ -339,6 +334,12 @@ VLM_TEST_SETTINGS = { auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, patch_hf_runner=model_utils.mantis_patch_hf_runner, + marks=[ + pytest.mark.skipif( + Version(TRANSFORMERS_VERSION) >= Version("4.48"), + reason="HF model is not compatible with transformers>=4.48.0", + ) + ], ), "minicpmv_25": VLMTestInfo( models=["openbmb/MiniCPM-Llama3-V-2_5"], diff --git a/tests/models/registry.py b/tests/models/registry.py index 7b5032f79..285fbe484 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -224,8 +224,7 @@ _CROSS_ENCODER_EXAMPLE_MODELS = { _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] - "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", - min_transformers_version="4.48"), + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 6cccd2aa2..459c0d9d1 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,11 +1,13 @@ # 
SPDX-License-Identifier: Apache-2.0 from contextlib import nullcontext +from types import MethodType from typing import cast from unittest.mock import MagicMock import numpy as np import pytest +from transformers import ProcessorMixin from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -636,3 +638,70 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): mm_data=mm_data, hf_processor_mm_kwargs={}, ) + + +class _ProcessorProxy: + + def __init__(self, processor: ProcessorMixin) -> None: + super().__init__() + + self.__processor = processor + + def __getattr__(self, key: str): + return getattr(self.__processor, key) + + def __call__( + self, + text=None, + images=None, + videos=None, + exists=None, + return_tensors=None, + ): + return dict(exists=exists) + + +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"]) # Dummy +# yapf: disable +@pytest.mark.parametrize( + ("call_kwargs", "expected_kwargs"), + [ + # Should ignore invalid kwargs + ({"does_not_exist": 100}, {"exists": None}), + ({"exists": 1}, {"exists": 1}), + ({"does_not_exist": 100, "exists": 1}, {"exists": 1}), + ], +) +# yapf: enable +def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): + model_config = ModelConfig( + model=model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="half", + revision=None, + ) + + processor = MULTIMODAL_REGISTRY.create_processor( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + orig_get_hf_processor = processor.info.get_hf_processor + + def get_hf_processor(self, **kwargs): + assert kwargs == call_kwargs + return _ProcessorProxy(orig_get_hf_processor()) + + processor.info.get_hf_processor = MethodType(get_hf_processor, + processor.info) + + out_kwargs = processor._call_hf_processor( + prompt="", + mm_data={}, + mm_kwargs=call_kwargs, + ) + + assert out_kwargs == expected_kwargs diff --git 
a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py deleted file mode 100644 index 5d18b2ed7..000000000 --- a/tests/multimodal/test_processor_kwargs.py +++ /dev/null @@ -1,402 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from array import array -from typing import Callable, Dict, Mapping, Optional -from unittest.mock import patch - -import pytest -import torch - -from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext, - InputRegistry, ProcessorInputs, token_inputs) -from vllm.multimodal import MultiModalRegistry -from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData - -from ..models.utils import build_model_context - -# Used for fast tests where the model doesn't matter -DUMMY_MODEL_ID = "facebook/opt-125m" -# Used for tests that need a multimodal model -MULTIMODAL_MODEL_ID = "OpenGVLab/InternVL2-2B" - -# For mm_processor_kwargs - we test overrides by defining mocks for each place -# it is used, and ensuring that we can pass processor kwargs an override value -# to receive the intended result for things like sequence length etc. 
-DEFAULT_MAX_DYNAMIC_PATCH = 6 -MAX_DYNAMIC_PATCH_OVERRIDE = 4 - - -# Mocks for all of the places that we use the mm_processor_kwargs -# to override values in different callables -@pytest.fixture -def use_processor_mock(): - """Patches the internal model input processor with an override callable.""" - - def custom_processor(ctx: InputContext, - inputs: DecoderOnlyInputs, - *, - max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH): - # For testing purposes, we don't worry about the prompt - return token_inputs( - prompt_token_ids=[], - mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch}) - - with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor", - return_value=custom_processor): - yield - - -@pytest.fixture -def use_dummy_data_mock(): - """Patches the internal model input processor with an override callable.""" - - def custom_dummy_data_factory(self, - ctx: InputContext, - seq_len: int, - mm_counts: Mapping[str, int], - *, - max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH): - seq_data = SequenceData( - array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * max_dynamic_patch)) - return DummyData(seq_data, None) - - with patch( - "vllm.inputs.registry.InputRegistry._default_dummy_data_factory", - custom_dummy_data_factory): - yield - - -# Lazy import to avoid CUDA reinitialization error -def mm_model_cls(): - from vllm.model_executor.models.internvl import InternVLChatModel - - return InternVLChatModel - - -# lambda whose signature matches max token calcs extra & mapper + extra kwargs -get_max_dynamic_patch = lambda ctx, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: max_dynamic_patch # noqa: E501 -custom_mapper = lambda ctx, data, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: { # noqa: E501 - "pixel_values": torch.zeros(size=(1, max_dynamic_patch + 1, 3, 448, 448)) -} - - -### Tests for default processor logic & mm_processor_kwargs wrapping -def test_default_processor_is_a_noop(): - """Ensure that by default, there is no processor override.""" - dummy_registry = 
InputRegistry() - ctx = build_model_context(DUMMY_MODEL_ID) - processor = dummy_registry.create_input_processor(ctx.model_config) - proc_inputs = token_inputs(prompt_token_ids=[], prompt="") - proc_outputs = processor(inputs=proc_inputs) - assert proc_inputs is proc_outputs - - -def _get_max_dynamic_patch_info(init_max_dynamic_patch: int, - inference_max_dynamic_patch: int): - """Get the init / inference kwargs and expected max_dynamic_patch.""" - # If we have a value for max_dynamic_patch, pass the override value and make - # sure we get that value as a return-value from out mock processor, - # otherwise fall back to the default value - init_kwargs = None if init_max_dynamic_patch is None else { - "max_dynamic_patch": init_max_dynamic_patch - } - inference_kwargs = None if inference_max_dynamic_patch is None else { - "max_dynamic_patch": inference_max_dynamic_patch - } - if inference_max_dynamic_patch is not None: - expected_seq_count = inference_max_dynamic_patch - elif init_max_dynamic_patch is not None: - expected_seq_count = init_max_dynamic_patch - else: - expected_seq_count = DEFAULT_MAX_DYNAMIC_PATCH - return init_kwargs, inference_kwargs, expected_seq_count - - -def _get_processed_max_dynamic_patch( - processor: Callable[[ProcessorInputs], ProcessorInputs], - inference_kwargs: Optional[Dict[str, int]], -) -> int: - processed_inputs = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=inference_kwargs)) - - assert "type" in processed_inputs - assert processed_inputs["type"] == "token" - assert "mm_processor_kwargs" in processed_inputs - return processed_inputs["mm_processor_kwargs"]["max_dynamic_patch"] - - -@pytest.mark.parametrize( - "init_max_dynamic_patch,inference_max_dynamic_patch", [ - (None, None), - (MAX_DYNAMIC_PATCH_OVERRIDE, None), - (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE), - ]) -def test_input_processor_kwargs(use_processor_mock, init_max_dynamic_patch, - inference_max_dynamic_patch): - """Ensure 
input processors can use processor kwargs.""" - dummy_registry = InputRegistry() - - (init_kwargs, inference_kwargs, - expected_seq_count) = _get_max_dynamic_patch_info( - init_max_dynamic_patch, inference_max_dynamic_patch) - - ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs) - processor = dummy_registry.create_input_processor(ctx.model_config) - max_dynamic_patch_val = _get_processed_max_dynamic_patch( - processor, inference_kwargs) - - assert max_dynamic_patch_val == expected_seq_count - - -@pytest.mark.parametrize( - "mm_processor_kwargs", - [ - # Not part of the signature - { - "does_not_exist": 100 - }, - # Part of the signature, not keyword only - { - "ctx": "something bad" - } - ]) -def test_processor_with_sad_kwarg_overrides(use_processor_mock, - mm_processor_kwargs): - """Ensure that input processors filter out invalid mm_processor_kwargs""" - dummy_registry = InputRegistry() - # Should filter out the init time kwargs - ctx = build_model_context(DUMMY_MODEL_ID, - mm_processor_kwargs=mm_processor_kwargs) - - processor = dummy_registry.create_input_processor(ctx.model_config) - # Should filter out the inference time kwargs - max_dynamic_patch_val = _get_processed_max_dynamic_patch( - processor, mm_processor_kwargs) - assert max_dynamic_patch_val == DEFAULT_MAX_DYNAMIC_PATCH - - -### Test overrides for the dummy data -@pytest.mark.parametrize("max_dynamic_patch", - [None, MAX_DYNAMIC_PATCH_OVERRIDE]) -def test_dummy_data_kwarg_overrides(use_dummy_data_mock, max_dynamic_patch): - """Ensure dummy data factories can use processor kwargs.""" - mm_processor_kwargs = None if max_dynamic_patch is None else { - "max_dynamic_patch": max_dynamic_patch - } - expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH - if max_dynamic_patch is None else max_dynamic_patch) - dummy_registry = InputRegistry() - ctx = build_model_context(DUMMY_MODEL_ID, - mm_processor_kwargs=mm_processor_kwargs) - mm_registry = MultiModalRegistry() - 
mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - # NOTE: seq_len is thrown away here since this will leverage the - # default dummy data factory that we have patched in, whose seq - # len is solely dependent on the value of the mm_processor_kwargs. - dummy_data = dummy_registry.dummy_data_for_profiling( - ctx.model_config, seq_len=-1, mm_registry=mm_registry) - assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count - - -@pytest.mark.parametrize( - "mm_processor_kwargs", - [ - # Not part of the signature - { - "does_not_exist": 100 - }, - # Part of the signature, not keyword only - { - "ctx": "something bad" - } - ]) -def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock, - mm_processor_kwargs): - """Ensure the dummy data factory filters out invalid mm_processor_kwargs""" - dummy_registry = InputRegistry() - ctx = build_model_context(DUMMY_MODEL_ID, - mm_processor_kwargs=mm_processor_kwargs) - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - # NOTE: seq_len is thrown away here since this will leverage the - # default dummy data factory that we have patched in, whose seq - # len is solely dependent on the value of the mm_processor_kwargs. 
- dummy_data = dummy_registry.dummy_data_for_profiling( - ctx.model_config, seq_len=-1, mm_registry=mm_registry) - assert len( - dummy_data.seq_data.prompt_token_ids) == DEFAULT_MAX_DYNAMIC_PATCH - - -### Test overrides for the max token count per multimodal instance -@pytest.mark.parametrize("max_dynamic_patch", - [None, MAX_DYNAMIC_PATCH_OVERRIDE]) -def test_max_tokens_kwarg_overrides(max_dynamic_patch): - """Ensure max token calcs can use processor kwargs.""" - mm_processor_kwargs = None if max_dynamic_patch is None else { - "max_dynamic_patch": max_dynamic_patch - } - expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH - if max_dynamic_patch is None else max_dynamic_patch) - - ctx = build_model_context(MULTIMODAL_MODEL_ID, - task="generate", - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - limit_mm_per_prompt={"image": 1}) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - # Patch the image registry for phi3v with our lambda that is compatible - # with overrides, then ensure that calling the method correctly echos - # our max_dynamic_patch value back from the mm_processor_kwargs. 
- with patch.object( - mm_registry._get_plugin("image"), - "_max_mm_tokens", - {mm_model_cls(): get_max_dynamic_patch}, - ): - max_multimodal_tokens = mm_registry.get_max_multimodal_tokens( - ctx.model_config) - - assert expected_seq_count == max_multimodal_tokens - - -@pytest.mark.parametrize( - "mm_processor_kwargs", - [ - # Not part of the signature - { - "does_not_exist": 100 - }, - # Part of the signature, not keyword only - { - "ctx": "something bad" - } - ]) -def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs): - """Ensure that max token calcs filters out invalid mm_processor_kwargs""" - ctx = build_model_context(MULTIMODAL_MODEL_ID, - task="generate", - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - limit_mm_per_prompt={"image": 1}) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - # Similar before, but since these kwargs get filtered, - # we always get our default value back. - with patch.object( - mm_registry._get_plugin("image"), - "_max_mm_tokens", - {mm_model_cls(): get_max_dynamic_patch}, - ): - max_multimodal_tokens = mm_registry.get_max_multimodal_tokens( - ctx.model_config) - - assert max_multimodal_tokens == DEFAULT_MAX_DYNAMIC_PATCH - - -### Test overrides for the mapper -@pytest.mark.parametrize( - "max_dynamic_patch", - [DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE]) -def test_default_mapper_with_processor_kwargs(image_assets, max_dynamic_patch): - """Ensure that the mapper processor kwargs can fall back to HF models.""" - # NOTE - we don't validate bad inputs for the default mapper, because it's - # through the automodel interface in transformers, so we can't easily - # inspect what kwargs are or are not allowed. 
- ctx = build_model_context( - MULTIMODAL_MODEL_ID, - task="generate", - trust_remote_code=True, - mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch}, - limit_mm_per_prompt={"image": 1}) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - image = image_assets[0].pil_image - mm_inputs = {"image": image} - - mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs) - # pixel vals should have shape: [batch, max_dynamic_patch+1, ...] - assert mapped_inputs["pixel_values"].shape[1] == max_dynamic_patch + 1 - - -@pytest.mark.parametrize( - "init_max_dynamic_patch,inference_max_dynamic_patch", [ - (None, None), - (MAX_DYNAMIC_PATCH_OVERRIDE, None), - (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE), - ]) -def test_custom_mapper_kwarg_overrides(image_assets, init_max_dynamic_patch, - inference_max_dynamic_patch): - """Ensure custom mappers can use processor kwargs.""" - (init_kwargs, inference_kwargs, - expected_seq_count) = _get_max_dynamic_patch_info( - init_max_dynamic_patch, inference_max_dynamic_patch) - - ctx = build_model_context(MULTIMODAL_MODEL_ID, - task="generate", - trust_remote_code=True, - mm_processor_kwargs=init_kwargs, - limit_mm_per_prompt={"image": 1}) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - image = image_assets[0].pil_image - mm_inputs = {"image": image} - - # Patch the image registry for phi3v with our lambda that is compatible - # with overrides, then ensure that calling the method correctly echos - # our max_dynamic_patch value back from the mm_processor_kwargs. 
- mm_registry._get_plugin("image").register_input_mapper(custom_mapper)( - mm_model_cls()) - mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs, - inference_kwargs) - - assert mapped_inputs["pixel_values"].shape[1] == expected_seq_count + 1 - - -@pytest.mark.parametrize( - "mm_processor_kwargs", - [ - # Not part of the signature - { - "does_not_exist": 100 - }, - # Part of the signature, not keyword only - { - "ctx": "something bad" - } - ]) -def test_custom_mapper_with_sad_kwarg_overrides(image_assets, - mm_processor_kwargs): - """Ensure that custom mappers filters out invalid mm_processor_kwargs""" - # Should filter out the init time kwargs - ctx = build_model_context(MULTIMODAL_MODEL_ID, - task="generate", - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - limit_mm_per_prompt={"image": 1}) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - image = image_assets[0].pil_image - mm_inputs = {"image": image} - - # Patch the image registry for phi3v with our lambda that is compatible - # with overrides, then ensure that calling the method correctly echos - # our max_dynamic_patch value back from the mm_processor_kwargs. 
- mm_registry._get_plugin("image").register_input_mapper(custom_mapper)( - mm_model_cls()) - # Should filter out the inference time kwargs - mapped_inputs = mm_registry.map_input( - ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs) - - assert mapped_inputs["pixel_values"].shape[1] == ( - DEFAULT_MAX_DYNAMIC_PATCH + 1) -- GitLab From 233df6f5c4520ae57e4a24acfbaedcc9ce166074 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 5 Feb 2025 00:46:54 +0000 Subject: [PATCH 37/65] [V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 1 + vllm/v1/engine/__init__.py | 21 +++++++++++++++++++-- vllm/v1/engine/detokenizer.py | 9 +++++---- vllm/v1/engine/output_processor.py | 22 ++++++++++++---------- vllm/v1/metrics/loggers.py | 15 ++++++++++++++- vllm/v1/metrics/stats.py | 10 +++++++--- vllm/v1/request.py | 15 ++++++++------- 7 files changed, 66 insertions(+), 27 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index a9134be62..de2333901 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -205,6 +205,7 @@ EXPECTED_METRICS_V1 = [ "vllm:gpu_cache_usage_perc", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", + "vllm:request_success_total", "vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_count", diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 912b92862..6bd548bdc 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -15,6 +15,23 @@ if TYPE_CHECKING: from vllm.sampling_params import SamplingParams +class RequestFinishedReason(enum.IntEnum): + """ + Reason a request finished - stop, length, or abort. 
+ + stop - a stop string was emitted + length - max_tokens was consumed, or max_model_len was reached + abort - aborted for another reason + + """ + STOP = 0 + LENGTH = 1 + ABORT = 2 + + def __str__(self): + return self.name.lower() + + @dataclass class EngineCoreRequest: @@ -45,7 +62,7 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[RequestFinishedReason] = None stop_reason: Union[int, str, None] = None @@ -56,7 +73,7 @@ class EngineCoreOutputs( gc=False): # type: ignore[call-arg] #NOTE(Nick): We could consider ways to make this more compact, - # e.g. columnwise layout and using an int enum for finish/stop reason + # e.g. columnwise layout # [num_reqs] outputs: List[EngineCoreOutput] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6d800f026..2bce23e68 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,7 +8,8 @@ from vllm.logger import init_logger from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest, + RequestFinishedReason) logger = init_logger(__name__) @@ -18,7 +19,7 @@ class DetokenizerOutput: output_text: str token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[RequestFinishedReason] = None stop_reason: Union[int, str, None] = None @@ -147,13 +148,13 @@ class IncrementalDetokenizer: stop_str, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant + finish_reason = RequestFinishedReason.STOP stop_reason = stop_str # TODO: handle stop_token_ids here too? # 3) Update the RequestOutput object with the new text. 
- finished = bool(finish_reason) + finished = finish_reason is not None if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: return None diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index aeefd5239..947366691 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -161,8 +161,10 @@ class OutputProcessor: engine_core_output) # 3) Create and handle RequestOutput objects. - if request_output := self._make_request_output( - req_state, detokenizer_output): + if detokenizer_output is not None: + request_output = self._make_request_output( + req_state, detokenizer_output) + if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put_nowait(request_output) @@ -172,6 +174,8 @@ class OutputProcessor: # Free completed requests. if request_output.finished: + assert detokenizer_output.finish_reason is not None + self.request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer @@ -180,7 +184,8 @@ class OutputProcessor: # Track per-request stats iteration_stats.update_from_finished_request( - request_output, req_state.stats) + detokenizer_output.finish_reason, request_output, + req_state.stats) return OutputProcessorOutput( request_outputs=request_outputs, @@ -191,12 +196,8 @@ class OutputProcessor: @staticmethod def _make_request_output( request_state: RequestState, - detokenizer_output: Optional[DetokenizerOutput], - ) -> Optional[RequestOutput]: - - if detokenizer_output is None: - return None - + detokenizer_output: DetokenizerOutput, + ) -> RequestOutput: request_output = RequestOutput.new( request_state.request_id, request_state.prompt, @@ -207,7 +208,8 @@ class OutputProcessor: ) if detokenizer_output.finished: completion_output = request_output.outputs[0] - completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.finish_reason = str( + 
detokenizer_output.finish_reason) completion_output.stop_reason = detokenizer_output.stop_reason return request_output diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f736e38f1..b62351a8f 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -2,13 +2,14 @@ import time from abc import ABC, abstractmethod -from typing import List +from typing import Dict, List import numpy as np import prometheus_client from vllm.config import ModelConfig from vllm.logger import init_logger +from vllm.v1.engine import RequestFinishedReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -116,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) + self.counter_request_success: Dict[RequestFinishedReason, + prometheus_client.Counter] = {} + counter_request_success_base = prometheus_client.Counter( + name="vllm:request_success_total", + documentation="Count of successfully processed requests.", + labelnames=labelnames + ["finished_reason"]) + for reason in RequestFinishedReason: + self.counter_request_success[ + reason] = counter_request_success_base.labels(*(labelvalues + + [str(reason)])) + self.histogram_num_prompt_tokens_request = \ prometheus_client.Histogram( name="vllm:request_prompt_tokens", @@ -163,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase): iteration_stats.num_generation_tokens) for finished_request in iteration_stats.finished_requests: + self.counter_request_success[finished_request.finish_reason].inc() self.histogram_num_prompt_tokens_request.observe( finished_request.num_prompt_tokens) self.histogram_num_generation_tokens_request.observe( diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 88f2c0835..36c95e07d 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List if 
TYPE_CHECKING: from vllm.outputs import RequestOutput - from vllm.v1.engine import EngineCoreOutput + from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason @dataclass @@ -32,6 +32,7 @@ class RequestStateStats: class FinishedRequestStats: """Stats associated with a finished request.""" + finish_reason: "RequestFinishedReason" num_prompt_tokens: int = 0 num_generation_tokens: int = 0 @@ -73,8 +74,11 @@ class IterationStats: request_state_stats.num_generation_tokens += num_new_generation_tokens request_state_stats.last_token_time = now - def update_from_finished_request(self, request_output: "RequestOutput", + def update_from_finished_request(self, + finish_reason: "RequestFinishedReason", + request_output: "RequestOutput", request_state_stats: RequestStateStats): self.finished_requests.append( - FinishedRequestStats(len(request_output.prompt_token_ids), + FinishedRequestStats(finish_reason, + len(request_output.prompt_token_ids), request_state_stats.num_generation_tokens)) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 0519d9e78..eb9bf99b4 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -109,7 +109,7 @@ class Request: def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) - def get_finished_reason(self) -> Union[str, None]: + def get_finished_reason(self) -> Union[RequestFinishedReason, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: @@ -149,7 +149,8 @@ class RequestStatus(enum.IntEnum): return status > RequestStatus.PREEMPTED @staticmethod - def get_finished_reason(status: "RequestStatus") 
-> Union[str, None]: + def get_finished_reason( + status: "RequestStatus") -> Union[RequestFinishedReason, None]: return _FINISHED_REASON_MAP.get(status) @@ -158,8 +159,8 @@ class RequestStatus(enum.IntEnum): # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { - RequestStatus.FINISHED_STOPPED: "stop", - RequestStatus.FINISHED_LENGTH_CAPPED: "length", - RequestStatus.FINISHED_ABORTED: "abort", - RequestStatus.FINISHED_IGNORED: "length", + RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP, + RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH, + RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT, + RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH, } -- GitLab From 75e94309e8d8919e0daea041f6cd81a4b8c09060 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 4 Feb 2025 21:22:24 -0500 Subject: [PATCH 38/65] [Perf] Mem align KV caches for CUDA devices (MLA perf improvement) (#12676) Signed-off-by: simon-mo Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Co-authored-by: simon-mo --- csrc/cache.h | 3 + csrc/cache_kernels.cu | 82 +++++- csrc/torch_bindings.cpp | 4 + tests/kernels/test_cache.py | 262 ++++++++++++++++++ vllm/_custom_ops.py | 5 + vllm/attention/backends/triton_mla.py | 5 +- vllm/attention/ops/triton_decode_attention.py | 16 +- vllm/envs.py | 10 + vllm/utils.py | 10 + vllm/worker/cache_engine.py | 66 ++++- 10 files changed, 429 insertions(+), 34 deletions(-) diff --git a/csrc/cache.h b/csrc/cache.h index 55ed30bd8..cf4a65c29 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -15,6 +15,9 @@ void copy_blocks(std::vector const& key_caches, std::vector const& value_caches, const torch::Tensor& block_mapping); +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping); + void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, 
torch::Tensor& value_cache, torch::Tensor& slot_mapping, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 23a46b6ed..0960888d1 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -46,7 +46,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, char* src_ptr = static_cast(src.data_ptr()); char* dst_ptr = static_cast(dst.data_ptr()); - const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + const int64_t block_size_in_bytes = src.element_size() * src.stride(0); const at::cuda::OptionalCUDAGuard device_guard( src_device.is_cuda() ? src_device : dst_device); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -93,6 +96,24 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, } } +// Kernel for MLA, which works on a single joint kv_cache +// Grid: (num_layers, num_pairs) +template +__global__ void copy_blocks_mla_kernel( + int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping, + const int mem_footprint_per_block) { + const int layer_idx = blockIdx.x; + const int pair_idx = blockIdx.y; + scalar_t* cache = reinterpret_cast(cache_ptrs[layer_idx]); + int64_t src_block = block_mapping[2 * pair_idx]; + int64_t dst_block = block_mapping[2 * pair_idx + 1]; + int64_t src_offset = src_block * mem_footprint_per_block; + int64_t dst_offset = dst_block * mem_footprint_per_block; + for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) { + cache[dst_offset + i] = cache[src_offset + i]; + } +} + } // namespace vllm // Note: the key_caches and value_caches vectors are constant but @@ -147,6 +168,42 @@ void copy_blocks(std::vector const& key_caches, })); } +// copy blocks kernel for MLA (assumes a joint KV-cache) +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping) 
{ + int num_layers = kv_caches.size(); + if (num_layers == 0) { + return; + } + torch::Device cache_device = kv_caches[0].device(); + TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); + + std::vector cache_ptrs(num_layers); + for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + cache_ptrs[layer_idx] = + reinterpret_cast(kv_caches[layer_idx].data_ptr()); + } + torch::Tensor cache_ptrs_tensor = + torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) + .to(cache_device); + + int num_pairs = block_mapping.size(0); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + int mem_footprint_per_block = kv_caches[0].stride(0); + dim3 grid(num_layers, num_pairs); + dim3 block(std::min(1024, mem_footprint_per_block)); + const at::cuda::OptionalCUDAGuard device_guard(cache_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( + kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { + vllm::copy_blocks_mla_kernel<<>>( + cache_ptrs_tensor.data_ptr(), + block_mapping.data_ptr(), mem_footprint_per_block); + })); +} + namespace vllm { template @@ -254,6 +311,7 @@ __global__ void concat_and_cache_mla_kernel( // + pe_dim)] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, // + const int entry_stride, // const int kv_c_stride, // const int k_pe_stride, // const int kv_lora_rank, // @@ -274,9 +332,8 @@ __global__ void concat_and_cache_mla_kernel( int src_stride, int dst_stride, int size, int offset) { for (int i = threadIdx.x; i < size; i += blockDim.x) { const int64_t src_idx = token_idx * src_stride + i; - const int64_t dst_idx = block_idx * block_stride + - block_offset * (kv_lora_rank + pe_dim) + i + - offset; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; if constexpr 
(kv_dt == Fp8KVCacheDataType::kAuto) { dst[dst_idx] = src[src_idx]; } else { @@ -391,14 +448,14 @@ void reshape_and_cache_flash( // KV_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors. // KV_DTYPE is the real data type of kv-cache. -#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ - vllm::concat_and_cache_mla_kernel \ - <<>>( \ - reinterpret_cast(kv_c.data_ptr()), \ - reinterpret_cast(k_pe.data_ptr()), \ - reinterpret_cast(kv_cache.data_ptr()), \ - slot_mapping.data_ptr(), block_stride, kv_c_stride, \ - k_pe_stride, kv_lora_rank, pe_dim, block_size, \ +#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ + vllm::concat_and_cache_mla_kernel \ + <<>>( \ + reinterpret_cast(kv_c.data_ptr()), \ + reinterpret_cast(k_pe.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, entry_stride, \ + kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ reinterpret_cast(scale.data_ptr())); void concat_and_cache_mla( @@ -428,6 +485,7 @@ void concat_and_cache_mla( int kv_c_stride = kv_c.stride(0); int k_pe_stride = k_pe.stride(0); int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); dim3 grid(num_tokens); dim3 block(std::min(kv_lora_rank, 512)); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 186e9c0e8..c03806f43 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -450,6 +450,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { "Tensor block_mapping) -> ()"); cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks); + cache_ops.def( + "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()"); + cache_ops.impl("copy_blocks_mla", torch::kCUDA, ©_blocks_mla); + // Reshape the key and value tensors and cache them. 
cache_ops.def( "reshape_and_cache(Tensor key, Tensor value," diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 6f909b680..21c02c5de 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -9,6 +9,7 @@ import torch from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.utils import align_to_256bytes COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -18,6 +19,13 @@ NUM_HEADS = [8] # Arbitrary values for testing HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] +# Parameters for MLA tests. +KV_LORA_RANKS = [512] +QK_ROPE_HEAD_DIMS = [64] +NUM_TOKENS_MLA = [42] +BLOCK_SIZES_MLA = [16] +NUM_BLOCKS_MLA = [8] + # Arbitrary values for testing # don't make it too large. e.g. [1024, 36000] will OOM NUM_BLOCKS = [1024, 10000] @@ -432,3 +440,257 @@ def test_fp8_e4m3_conversion( ops.convert_fp8(converted_cache, cache_fp8) torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1) + + +def _create_mla_cache( + num_blocks: int, + block_size: int, + entry_size: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, + align_cache: bool, +) -> torch.Tensor: + cache_dtype = torch.uint8 if kv_cache_dtype == "fp8" else dtype + + if align_cache: + alloc_entry_size = align_to_256bytes(entry_size, cache_dtype) + alloc_shape = (num_blocks, block_size, alloc_entry_size) + cache_full = torch.zeros(alloc_shape, dtype=cache_dtype, device=device) + cache = cache_full[..., :entry_size] + else: + cache = torch.zeros(num_blocks, + block_size, + entry_size, + dtype=cache_dtype, + device=device) + return cache + + +def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str): + rand_dtype = torch.float16 if kv_cache_dtype == "fp8" else cache.dtype + + vals = torch.randn(*cache.shape, device=cache.device, dtype=rand_dtype) + if kv_cache_dtype 
== "fp8": + temp = torch.zeros_like(cache) + ops.convert_fp8(temp, vals, 1.0, kv_dtype=kv_cache_dtype) + vals = temp + cache.copy_(vals) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False]) +@torch.inference_mode() +def test_concat_and_cache_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + num_tokens: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + total_slots = num_blocks * block_size + slot_mapping_lst = random.sample(range(total_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, + dtype=torch.long, + device=device) + + kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device) + k_pe = torch.randn(num_tokens, + qk_rope_head_dim, + dtype=dtype, + device=device) + entry_size = kv_lora_rank + qk_rope_head_dim + + scale = torch.tensor(0.1, dtype=torch.float32, device=device) + kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device) + + for i in range(num_tokens): + slot = slot_mapping[i].item() + block_idx = slot // block_size + block_offset = slot % block_size + ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[i] + ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[i] + + if kv_cache_dtype == "fp8": + ref_kv_cache = torch.empty_like(ref_temp, 
dtype=kv_cache.dtype) + ops.convert_fp8(ref_kv_cache, + ref_temp, + scale.item(), + kv_dtype=kv_cache_dtype) + else: + ref_kv_cache = ref_temp + + opcheck( + torch.ops._C_cache_ops.concat_and_cache_mla, + (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + + ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, + kv_cache_dtype, scale) + + if kv_cache_dtype == "fp8": + result_temp = torch.empty_like(kv_cache, dtype=torch.float16) + ops.convert_fp8(result_temp, + kv_cache.contiguous(), + scale.item(), + kv_dtype=kv_cache_dtype) + expected_temp = torch.empty_like(ref_kv_cache, dtype=torch.float16) + ops.convert_fp8(expected_temp, + ref_kv_cache, + scale.item(), + kv_dtype=kv_cache_dtype) + torch.testing.assert_close(result_temp, + expected_temp, + atol=0.001, + rtol=0.1) + else: + torch.testing.assert_close(kv_cache, ref_kv_cache) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("num_layers", NUM_LAYERS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False, True]) +@torch.inference_mode() +def test_copy_blocks_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + block_size: int, + num_blocks: int, + num_layers: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + entry_size = kv_lora_rank + qk_rope_head_dim + + kv_caches = [] + for _ in range(num_layers): + kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + 
_fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype) + kv_caches.append(kv_cache) + + ref_caches = [kv_cache.clone() for kv_cache in kv_caches] + + num_mappings = min(2, num_blocks // 2) + src_blocks = random.sample(range(num_blocks), num_mappings) + remaining = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining, 2 * num_mappings) + block_mapping = [] + for i in range(num_mappings): + src = src_blocks[i] + dst1 = dst_blocks[2 * i] + dst2 = dst_blocks[2 * i + 1] + block_mapping.append((src, dst1)) + block_mapping.append((src, dst2)) + block_mapping_tensor = torch.tensor(block_mapping, + dtype=torch.int64, + device=device).view(-1, 2) + + for src, dst in block_mapping: + for ref_cache in ref_caches: + ref_cache[dst].copy_(ref_cache[src]) + + opcheck( + torch.ops._C_cache_ops.copy_blocks_mla, + (kv_caches, block_mapping_tensor), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + ops.copy_blocks_mla(kv_caches, block_mapping_tensor) + + for kv_cache, ref_cache in zip(kv_caches, ref_caches): + torch.testing.assert_close(kv_cache, ref_cache) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False, True]) +@torch.inference_mode() +def test_swap_blocks_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + entry_size = kv_lora_rank + qk_rope_head_dim + + src_cache = _create_mla_cache(num_blocks, block_size, 
entry_size, dtype, + kv_cache_dtype, device, align_cache) + dst_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + + _fill_mla_cache(src_cache, kv_cache_dtype) + _fill_mla_cache(dst_cache, kv_cache_dtype) + + src_cache_clone = src_cache.clone() + + num_mappings = min(2, num_blocks // 2) + src_blocks = random.sample(range(num_blocks), num_mappings) + remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining_blocks, num_mappings) + block_mapping = list(zip(src_blocks, dst_blocks)) + block_mapping_tensor = torch.tensor(block_mapping, + dtype=torch.int64, + device="cpu").view(-1, 2) + + opcheck( + torch.ops._C_cache_ops.swap_blocks, + (src_cache, dst_cache, block_mapping_tensor), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + cond=(kv_lora_rank == KV_LORA_RANKS[0] + and qk_rope_head_dim == QK_ROPE_HEAD_DIMS[0]), + ) + + ops.swap_blocks(src_cache, dst_cache, block_mapping_tensor) + + for src, dst in block_mapping: + torch.testing.assert_close( + src_cache_clone[src].cpu(), + dst_cache[dst].cpu(), + msg=f"Block {src} from src should have been swapped to block " + f"{dst} in dst_cache.") diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index bdc9a6a33..a68235016 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1037,6 +1037,11 @@ def copy_blocks(key_caches: List[torch.Tensor], torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) +def copy_blocks_mla(kv_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) + + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 20d7ef0fa..9a1984a93 100644 --- a/vllm/attention/backends/triton_mla.py +++ 
b/vllm/attention/backends/triton_mla.py @@ -26,7 +26,6 @@ from vllm.attention.backends.mla.utils import MLACommonImpl, MLACommonMetadata from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) -from vllm.attention.ops.paged_attn import PagedAttention from vllm.attention.ops.triton_decode_attention import decode_attention_fwd from vllm.utils import async_tensor_h2d, make_tensor_with_pad @@ -72,14 +71,14 @@ class TritonMLABackend(AttentionBackend): dst_kv_cache: torch.Tensor, src_to_dst: torch.Tensor, ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) + ops.copy_blocks_mla(kv_caches, src_to_dists) @staticmethod def get_supported_head_sizes() -> List[int]: diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index ec5ec4ce6..057fccb5e 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -204,10 +204,10 @@ def _decode_att_m_fwd( Req_to_tokens.stride(0), q.stride(0), q.stride(1), - k_buffer.stride(-2), - k_buffer.stride(-1), - v_buffer.stride(-2), - v_buffer.stride(-1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) att_out.stride(0), att_out.stride(1), att_out.stride(2), @@ -438,10 +438,10 @@ def _decode_grouped_att_m_fwd( Req_to_tokens.stride(0), q.stride(0), q.stride(1), - k_buffer.stride(-2), - k_buffer.stride(-1), - v_buffer.stride(-2), - v_buffer.stride(-1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, 
NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) att_out.stride(0), att_out.stride(1), att_out.stride(2), diff --git a/vllm/envs.py b/vllm/envs.py index 5018f6deb..2c731eda7 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -82,6 +82,7 @@ if TYPE_CHECKING: VLLM_MLA_DISABLE: bool = False VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True VLLM_MLA_DISABLE_REQUANTIZATION: bool = False + VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False @@ -539,6 +540,15 @@ environment_variables: Dict[str, Callable[[], Any]] = { "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) ), + + # When on a Nvidia GPU aligns single entries (within a page) so they are 256 + # byte aligned for better performance, this increases the memory usage of + # the cache. Currently this only affects MLA that results in non-256 + # byte aligned entries. This matches the alignment the CUDA runtime uses + # for all allocations. Currently this primarily affects MLA, for most other + # models the alignment is already naturally aligned to 256 bytes. 
+ "VLLM_CUDA_MEM_ALIGN_KV_CACHE": + lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))), } # end-env-vars-definition diff --git a/vllm/utils.py b/vllm/utils.py index a2b53fcf2..8b9269598 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -563,6 +563,10 @@ def cdiv(a: int, b: int) -> int: return -(a // -b) +def round_up(x: int, y: int) -> int: + return ((x + y - 1) // y) * y + + def _generate_random_fp8( tensor: torch.Tensor, low: float, @@ -794,6 +798,12 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() +def align_to_256bytes(extent: int, dtype: torch.dtype) -> int: + dtype_size = get_dtype_size(dtype) + eles_per_256bytes = 256 // dtype_size + return round_up(extent, eles_per_256bytes) + + # `collections` helpers def is_list_of( value: object, diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 252fe0660..3960392cf 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -2,13 +2,17 @@ """CacheEngine class for managing the KV cache.""" from typing import List +import numpy as np import torch +from vllm import envs from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, - get_dtype_size, is_pin_memory_available) + align_to_256bytes, get_dtype_size, + is_pin_memory_available) logger = init_logger(__name__) @@ -38,6 +42,7 @@ class CacheEngine: self.num_attention_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + self.align_cache = self._align_cache(model_config) self.block_size = cache_config.block_size self.num_gpu_blocks = cache_config.num_gpu_blocks @@ -75,15 +80,39 @@ class CacheEngine: num_blocks, self.block_size, self.num_kv_heads, 
self.head_size) pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] + + # Align entries so they are 256 byte aligned for better performance + # Primarily targets MLA as this typically only ends up having entries + # be 128 byte aligned. + if self.align_cache: + # We assume the cache shape is: + # (TOTAL_PAGES, PAGE_SIZE, entry_shape...) + # NOTE this assumption currently only holds for MLA so we only apply + # this optimization when `use_mla` is true + entry_shape = kv_cache_shape[2:] + entry_size = np.prod(entry_shape) + alloc_entry_size = align_to_256bytes(entry_size, self.dtype) + alloc_shape = (*kv_cache_shape[:2], alloc_entry_size) + else: + alloc_shape = kv_cache_shape + for _ in range(self.num_attention_layers): # null block in CpuGpuBlockAllocator requires at least that # block to be zeroed-out. # We zero-out everything for simplicity. - kv_cache.append( - torch.zeros(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device)) + layer_kv_cache = torch.zeros(alloc_shape, + dtype=self.dtype, + pin_memory=pin_memory, + device=device) + + # If we allocated with padding for alignment reasons truncate the + # shape while preserving the aligned stride + if self.align_cache: + layer_kv_cache = layer_kv_cache[..., :entry_size] + + # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) 
for cases + # when entry_shape is higher than 1D + kv_cache.append(layer_kv_cache.view(kv_cache_shape)) return kv_cache def swap_in(self, src_to_dst: torch.Tensor) -> None: @@ -99,6 +128,14 @@ class CacheEngine: def copy(self, src_to_dsts: torch.Tensor) -> None: self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) + @staticmethod + def _align_cache(model_config: ModelConfig): + # Currently align_cache only applies to MLA models since the other + # cache kernels haven't been updated yet to support non-continguous + # tensors + return model_config.use_mla and current_platform.is_cuda() \ + and envs.VLLM_CUDA_MEM_ALIGN_KV_CACHE + @staticmethod def get_cache_block_size( cache_config: CacheConfig, @@ -110,14 +147,21 @@ class CacheEngine: num_attention_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) - key_cache_block = cache_config.block_size * num_heads * head_size - # For MLA there is no value cache, since the latent vector - # is joint keys and values. - value_cache_block = key_cache_block if not model_config.use_mla else 0 - total = num_attention_layers * (key_cache_block + value_cache_block) if cache_config.cache_dtype == "auto": dtype = model_config.dtype else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + + key_cache_entry = num_heads * head_size + if CacheEngine._align_cache(model_config): + key_cache_entry = align_to_256bytes(key_cache_entry, + model_config.dtype) + + # For MLA there is no value cache, since the latent vector + # is joint keys and values. 
+ value_cache_entry = key_cache_entry if not model_config.use_mla else 0 + total = num_attention_layers * cache_config.block_size * \ + (key_cache_entry + value_cache_entry) + dtype_size = get_dtype_size(dtype) return dtype_size * total -- GitLab From b3a0d01e4551118c735ee905c4ddc800ec603f24 Mon Sep 17 00:00:00 2001 From: Aviv Keshet Date: Tue, 4 Feb 2025 18:46:26 -0800 Subject: [PATCH 39/65] [Core] add and implement `VLLM_LOGITS_PROCESSOR_THREADS` (#12368) Signed-off-by: Aviv Keshet --- vllm/envs.py | 9 ++++ .../model_executor/layers/logits_processor.py | 46 ++++++++++++++----- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 2c731eda7..bb419dacb 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: VLLM_LOGGING_LEVEL: str = "INFO" VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None + VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None @@ -282,6 +283,14 @@ environment_variables: Dict[str, Callable[[], Any]] = { "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # if set, vllm will call logits processors in a thread pool with this many + # threads. This is useful when using custom logits processors that either + # (a) launch additional CUDA kernels or (b) do significant CPU-bound work + # while not holding the python GIL, or both. 
+ "VLLM_LOGITS_PROCESSOR_THREADS": + lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")) + if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None, + # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index ebf74c67d..cdc67ca83 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """A layer that compute logits from hidden_stats.""" import inspect +from concurrent.futures import ThreadPoolExecutor from typing import Optional import torch @@ -15,6 +16,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform +_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None +if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: + _logits_processor_threadpool = ThreadPoolExecutor( + envs.VLLM_LOGITS_PROCESSOR_THREADS) + class LogitsProcessor(nn.Module): """Process logits and apply logits processors from sampling metadata. 
@@ -135,6 +141,7 @@ def _apply_logits_processors( ) -> torch.Tensor: found_logits_processors = False logits_processed = 0 + logits_row_ids_and_logits_row_futures = [] for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params @@ -148,22 +155,39 @@ def _apply_logits_processors( past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor(prompt_tokens_ids, - past_tokens_ids, - logits_row) - else: - logits_row = logits_processor(past_tokens_ids, - logits_row) - - logits[logits_row_idx] = logits_row + if _logits_processor_threadpool is not None: + logits_row_ids_and_logits_row_futures.append( + (logits_row_idx, + _logits_processor_threadpool.submit( + _apply_logits_processors_single_seq, logits_row, + logits_processors, past_tokens_ids, + prompt_tokens_ids))) + else: + logits[logits_row_idx] = \ + _apply_logits_processors_single_seq( + logits_row, logits_processors, past_tokens_ids, + prompt_tokens_ids) logits_processed += len(seq_group.sample_indices) + len( seq_group.prompt_logprob_indices) + for logits_row_idx, future in logits_row_ids_and_logits_row_futures: + logits[logits_row_idx] = future.result() + if found_logits_processors: # verifies that no rows in logits were missed unexpectedly assert logits_processed == logits.shape[0] return logits + + +def _apply_logits_processors_single_seq(logits_row, logits_processors, + past_tokens_ids, + prompt_tokens_ids) -> torch.Tensor: + for logits_processor in logits_processors: + parameters = inspect.signature(logits_processor).parameters + if len(parameters) == 3: + logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, + logits_row) + else: + logits_row = logits_processor(past_tokens_ids, logits_row) + return logits_row -- 
GitLab From 64862d106efa78032702f5fa5c110ccd6d654e9a Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Tue, 4 Feb 2025 19:58:22 -0800 Subject: [PATCH 40/65] [ROCM][AMD][TRITON] Halving warps number for fw_prefill to reduce spilling (#12713) Signed-off-by: Aleksandr Malyshev Co-authored-by: Aleksandr Malyshev --- vllm/attention/ops/prefix_prefill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index fbb6757ee..5fca16393 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -11,7 +11,7 @@ from vllm.platforms import current_platform # Static kernels parameters BASE_BLOCK = 128 if current_platform.has_device_capability(80) else 64 -NUM_WARPS = 8 +NUM_WARPS = 4 if current_platform.is_rocm() else 8 # To check compatibility IS_TURING = current_platform.get_device_capability() == (7, 5) -- GitLab From 249824c3bfef6b9c03dd087569ef1e1072b2a4b0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 5 Feb 2025 04:31:12 +0000 Subject: [PATCH 41/65] Refactor `Linear` handling in `TransformersModel` (#12727) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/layers/linear.py | 30 ++++----- vllm/model_executor/models/transformers.py | 76 ++++++++++------------ 2 files changed, 48 insertions(+), 58 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 08f1e103e..da8db08fe 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -2,7 +2,7 @@ import itertools from abc import abstractmethod -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn.functional as F @@ -47,8 +47,8 @@ def adjust_marlin_shard(param, shard_size, shard_offset): def 
adjust_bitsandbytes_4bit_shard(param: Parameter, - shard_offsets: Dict[str, Tuple[int, int]], - loaded_shard_id: str) -> Tuple[int, int]: + shard_offsets: dict[str, tuple[int, int]], + loaded_shard_id: str) -> tuple[int, int]: """Adjust the quantization offsets and sizes for BitsAndBytes sharding.""" total, _ = shard_offsets["total"] @@ -90,7 +90,7 @@ class LinearMethodBase(QuantizeMethodBase): @abstractmethod def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """Create weights for a linear layer. @@ -123,7 +123,7 @@ class UnquantizedLinearMethod(LinearMethodBase): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), @@ -179,7 +179,8 @@ class LinearBase(torch.nn.Module): self.quant_method = quant_config.get_quant_method(self, prefix=prefix) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, + x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]: raise NotImplementedError @@ -240,9 +241,8 @@ class ReplicatedLinear(LinearBase): assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) - def forward( - self, x: torch.Tensor - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + def forward(self, + x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]: bias = self.bias if not self.skip_bias_add else None assert self.quant_method is not None output = self.quant_method.apply(self, x, bias) @@ -288,7 +288,7 @@ class ColumnParallelLinear(LinearBase): skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: 
Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, + output_sizes: Optional[list[int]] = None, prefix: str = ""): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix) @@ -374,7 +374,7 @@ class ColumnParallelLinear(LinearBase): loaded_weight = loaded_weight.reshape(1) param.load_column_parallel_weight(loaded_weight=loaded_weight) - def forward(self, input_): + def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]: bias = self.bias if not self.skip_bias_add else None # Matrix multiply. @@ -422,7 +422,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): def __init__(self, input_size: int, - output_sizes: List[int], + output_sizes: list[int], bias: bool = True, gather_output: bool = False, skip_bias_add: bool = False, @@ -500,7 +500,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): current_shard_offset = 0 use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - shard_offsets: List[Tuple[int, int, int]] = [] + shard_offsets: list[tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size @@ -602,7 +602,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): """ current_shard_offset = 0 - shard_offsets: List[Tuple[int, int, int]] = [] + shard_offsets: list[tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size @@ -1124,7 +1124,7 @@ class RowParallelLinear(LinearBase): param.load_row_parallel_weight(loaded_weight=loaded_weight) - def forward(self, input_): + def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]: if self.input_is_parallel: input_parallel = input_ else: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 160beaa14..dfc714382 
100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,7 +15,7 @@ # limitations under the License. """Wrapper around `transformers` models""" import re -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Optional, Union import torch from torch import nn @@ -71,23 +72,10 @@ def vllm_flash_attention_forward( ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward -# Linear Layer that is compatible with transformers internal forward -# TODO: This is a temporary solution, we should find a better way to integrate -class HFColumnParallelLinear(ColumnParallelLinear): - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return super().forward(input)[0] - - -class HFRowParallelLinear(RowParallelLinear): - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return super().forward(input)[0] - - -def replace_tp_linear_class(orig_module: nn.Linear, - style: str, - quant_config=None): +def replace_linear_class( + linear: nn.Linear, + style: str, + quant_config=None) -> Union[ColumnParallelLinear, RowParallelLinear]: """ In model configurations, we use a neutral type (string) to specify parallel styles, here we use it to translate nn.Linear into vllm-style tp Linear. 
@@ -99,26 +87,28 @@ def replace_tp_linear_class(orig_module: nn.Linear, raise ValueError( f"Unsupported parallel style type {type(style)}, expected str") - input_size = orig_module.in_features - output_size = orig_module.out_features - bias = orig_module.bias is not None + vllm_linear_cls = { + "colwise": ColumnParallelLinear, + "rowwise": RowParallelLinear, + }.get(style) - if style == "colwise": - return HFColumnParallelLinear( - input_size, - output_size, - bias, - ) - elif style == "rowwise": - return HFRowParallelLinear( - input_size, - output_size, - bias, - ) - # We don't consider colwise_rep since it's used in lm_head - else: + if vllm_linear_cls is None: raise ValueError(f"Unsupported parallel style value: {style}") + class HFCompatibleLinear(vllm_linear_cls): + """ + Wrapper class that removes `output_bias` from returned output. + """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input)[0] + + return HFCompatibleLinear( + input_size=linear.in_features, + output_size=linear.out_features, + bias=linear.bias is not None, + ) + class TransformersModel(nn.Module): embedding_padding_modules = ["lm_head"] @@ -192,16 +182,16 @@ class TransformersModel(nn.Module): "support it yet!") for child_name, child_module in module.named_children(): - qual_name = prefix + child_name + qual_name = maybe_prefix(prefix, child_name) for pattern, style in self.config.base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): - new_module = replace_tp_linear_class( - child_module, style, self.quant_config) + new_module = replace_linear_class(child_module, style, + self.quant_config) setattr(module, child_name, new_module) self.log_replacement(qual_name, child_module, new_module) else: - self.tensor_parallelize(child_module, prefix=f"{qual_name}.") + self.tensor_parallelize(child_module, prefix=qual_name) def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings @@ 
-219,7 +209,7 @@ class TransformersModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], # argument not used + kv_caches: list[torch.Tensor], # argument not used attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -249,10 +239,10 @@ class TransformersModel(nn.Module): next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: name = f"{self.model.base_model_prefix}.{name}" -- GitLab From 98fd089fc974313ac13370f79f4c02a3839baf4a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 5 Feb 2025 12:44:26 +0800 Subject: [PATCH 42/65] [VLM] Add MLA with pure RoPE support for deepseek-vl2 models (#12729) --- vllm/attention/backends/mla/utils.py | 30 ++++++++++++++++++++--- vllm/model_executor/models/deepseek_v2.py | 3 ++- vllm/model_executor/models/deepseek_v3.py | 3 ++- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index 8e584cca3..cd8c08e5a 100644 --- a/vllm/attention/backends/mla/utils.py +++ b/vllm/attention/backends/mla/utils.py @@ -26,7 +26,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( apply_fp8_linear_generic, current_platform_fp8_dtype, is_fp8) from vllm.model_executor.layers.quantization.utils.quant_utils import ( scaled_dequantize, scaled_quantize) -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding +from vllm.model_executor.layers.rotary_embedding import ( + DeepseekScalingRotaryEmbedding, RotaryEmbedding) try: from 
vllm.vllm_flash_attn import flash_attn_varlen_func @@ -174,6 +175,8 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): self.v_head_dim = v_head_dim self.rotary_emb = rotary_emb + self.use_yarn_rope = isinstance(rotary_emb, + DeepseekScalingRotaryEmbedding) self.q_proj = q_proj self.kv_b_proj = kv_b_proj self.o_proj = o_proj @@ -420,6 +423,24 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): ) -> torch.Tensor: raise NotImplementedError + def apply_pure_rope( + self, + input_positions: torch.Tensor, + q_pe: torch.Tensor, + k_pe: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + seq_len = input_positions.size(0) + ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape + + q_pe, k_pe = self.rotary_emb( + input_positions, + q_pe.reshape(seq_len, -1), + k_pe.reshape(seq_len, -1), + ) + q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) + + return q_pe, k_pe + def forward( self, layer: AttentionLayer, @@ -444,13 +465,14 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): # Restore head dim (for rotary embedding) k_pe = k_pe.unsqueeze(1) assert hasattr(attn_metadata, "input_positions") + rope_fn = (self.rotary_emb + if self.use_yarn_rope else self.apply_pure_rope) if is_decode: q_nope = self._q_proj_and_k_up_proj(hidden_states_or_q_c) q_pe = torch.matmul(hidden_states_or_q_c, self.W_QR)\ .view(-1, self.num_heads, self.qk_rope_head_dim) - q_pe, k_pe = \ - self.rotary_emb(attn_metadata.input_positions, q_pe, k_pe) + q_pe, k_pe = rope_fn(attn_metadata.input_positions, q_pe, k_pe) else: assert is_prefill q = self.q_proj(hidden_states_or_q_c)[0]\ @@ -458,7 +480,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): # TODO(lucas): there must be a nicer way to write this line q[..., self.qk_nope_head_dim:], k_pe = \ - self.rotary_emb( + rope_fn( attn_metadata.input_positions, q[..., self.qk_nope_head_dim:], k_pe) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 
f5fede4d8..fdd584f9d 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -414,7 +414,8 @@ class DeepseekV2MLAAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py index a4829aa1a..81f82b182 100644 --- a/vllm/model_executor/models/deepseek_v3.py +++ b/vllm/model_executor/models/deepseek_v3.py @@ -422,7 +422,8 @@ class DeepseekV3MLAAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, -- GitLab From 686006a22020c80fcaab2d12064302505188f577 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 4 Feb 2025 23:44:48 -0500 Subject: [PATCH 43/65] [Misc] Bump the compressed-tensors version (#12736) --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index 97e33a6db..cfa020256 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -34,6 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.9.0 # required for compressed-tensors +compressed-tensors == 0.9.1 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py -- GitLab From 7ff7a638b66fdeca71257f245369bd7337e55d32 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 5 Feb 2025 00:32:06 -0500 Subject: [PATCH 44/65] [Model][Quant] Fix GLM, Fix fused module mappings for quantization (#12634) Signed-off-by: mgoin Signed-off-by: Kyle Sayers Co-authored-by: mgoin --- .../layers/quantization/base_config.py | 3 +- .../compressed_tensors/compressed_tensors.py | 37 +++-- .../quantization/compressed_tensors/utils.py | 140 +++++++----------- .../layers/quantization/quark/quark.py | 10 +- .../layers/quantization/quark/utils.py | 17 ++- .../layers/quantization/utils/quant_utils.py | 26 ++-- vllm/model_executor/model_loader/loader.py | 4 + vllm/model_executor/model_loader/utils.py | 22 +++ vllm/model_executor/models/chatglm.py | 26 +++- .../models/glm4_vision_encoder.py | 31 ++-- vllm/model_executor/models/minicpmv.py | 14 +- vllm/model_executor/models/qwen.py | 16 +- 12 files changed, 195 insertions(+), 151 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 2eefcc4f3..c0d8553c0 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -2,7 +2,7 @@ import inspect from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Mapping, Optional, Type import torch from torch import nn @@ -59,6 +59,7 @@ def method_has_implemented_embedding( class QuantizationConfig(ABC): """Base class for quantization configs.""" + packed_modules_mapping: Mapping[str, List[str]] = dict() @abstractmethod def get_name(self) -> str: diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 1a11b2419..0e3258e4a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -83,7 +83,9 @@ class CompressedTensorsConfig(QuantizationConfig): # Check if the layer is skipped for quantization. # TODO (@robertgshaw2): support module names - if should_ignore_layer(prefix, ignore=self.ignore): + if should_ignore_layer(prefix, + ignore=self.ignore, + fused_mapping=self.packed_modules_mapping): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) @@ -379,34 +381,29 @@ class CompressedTensorsConfig(QuantizationConfig): # Will be empty for models with only sparsity weight_quant = input_quant = None - sparsity_scheme: Optional[SparsityCompressionConfig] = None if self.target_scheme_map: matched_target = find_matched_target( layer_name=layer_name, module=layer, - targets=self.target_scheme_map.keys()) + targets=self.target_scheme_map.keys(), + fused_mapping=self.packed_modules_mapping) scheme_dict = self.target_scheme_map[matched_target] weight_quant = scheme_dict.get("weights") input_quant = scheme_dict.get("input_activations") - if self.sparsity_scheme_map: - is_ignored = False - with suppress(ValueError): - is_ignored = find_matched_target( - layer_name=layer_name, - module=layer, - targets=self.sparsity_ignore_list) - - # if the layer is in the sparsity ignore list, - # we should not apply any sparsity scheme - - if not is_ignored: - matched_target = find_matched_target( - layer_name=layer_name, - module=layer, - targets=self.sparsity_scheme_map.keys()) - sparsity_scheme = self.sparsity_scheme_map.get(matched_target) + # Find the sparsity scheme of the layer + # assume that fused layers inerhit first 
component's sparsity scheme + sparsity_targets = (self.sparsity_scheme_map.keys() - + set(self.sparsity_ignore_list)) + sparsity_scheme: Optional[SparsityCompressionConfig] = None + with suppress(ValueError): + matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=sparsity_targets, + fused_mapping=self.packed_modules_mapping) + sparsity_scheme = self.sparsity_scheme_map[matched_target] if self.supports_cutlass_24(weight_quant=weight_quant, input_quant=input_quant, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 4ea79531e..85ae1d5cb 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Iterable, Optional +from types import MappingProxyType +from typing import Iterable, List, Mapping, Optional from compressed_tensors import CompressionFormat from torch.nn import Module -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) - def is_activation_quantization_format(format: str) -> bool: _ACTIVATION_QUANTIZATION_FORMATS = [ @@ -19,8 +17,11 @@ def is_activation_quantization_format(format: str) -> bool: return format in _ACTIVATION_QUANTIZATION_FORMATS -def should_ignore_layer(layer_name: Optional[str], - ignore: Iterable[str]) -> bool: +def should_ignore_layer( + layer_name: Optional[str], + ignore: Iterable[str] = tuple(), + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: if layer_name is None: return False @@ -32,8 +33,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. 
- if proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in fused_mapping and layer_name not in ignore: + shard_proj_names = fused_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ @@ -79,55 +80,12 @@ def check_equal_or_regex_match(layer_name: str, return False -def _handle_fused_layers(func): - """ - Decorator to handle fused layers by mapping vllm fused layer names - to their corresponding unfused layer names for quantization/pruning schemes. - """ - # fused_layer_name -> unfused_layer_name - fused_layer_map = { - "qkv_proj": "q_proj", - "gate_up_proj": "up_proj", - } - - def fused_layer_handler(layer_name: Optional[str], module: Module, - targets: Iterable[str]) -> Optional[str]: - """ - Wrapper function specifically designed to support the - find_matched_target function. - - It handles cases where the provided layer name corresponds to a - fused layer in vllm, mapping it to its equivalent unfused layer name - based on the predefined fused_layer_map. If the original layer name - raises a ValueError in the wrapped function, this handler - will attempt to resolve the issue by substituting with unfused - layer name. - - :param layer_name: Name of the layer, which may be fused. - :param module: An instance of torch.nn.Module. - :param targets: A list of target names or patterns to match. - :return: The result of the wrapped find_matched_target function with - the resolved layer name. - :raises ValueError: If the layer name cannot be resolved to a - valid target. 
- """ - try: - return func(layer_name, module, targets) - except ValueError: - if layer_name is None: - layer_name = "" - parent_name, fused_proj_name = layer_name.rsplit(".", 1) - unfused_proj_name = fused_layer_map.get(fused_proj_name, - fused_proj_name) - new_layer_name = f"{parent_name}.{unfused_proj_name}" - return func(new_layer_name, module, targets) - - return fused_layer_handler - - -@_handle_fused_layers -def find_matched_target(layer_name: Optional[str], module: Module, - targets: Iterable[str]) -> str: +def find_matched_target( + layer_name: Optional[str], + module: Module, + targets: Iterable[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> str: """ Helper function to look up which "target" in the compressed-tensors config that a layer corresponds to. @@ -141,19 +99,25 @@ def find_matched_target(layer_name: Optional[str], module: Module, First, we try to match the layer_name with a target Second, we try to match the module's name with a target + Third, we try to map the layer_name to a list of fused module names. + *All* component module names must match in order for a match to be + successful. A successful match returns the first component target :param layer_name: layer name :param module: torch.nn.Module :param targets: list of targets to match the layer against + :param fused_mapping: map from fused layer names to its components + :param fused_strategy: either "all" or "any". 
If using "all", fused + layers match if "all" of its components match """ if layer_name is None: layer_name = "" - matched_target = (_find_first_match(layer_name, targets) - or _find_first_match(module.__class__.__name__, targets, - True) - or _match_fused_layer(layer_name, targets)) + matched_target = ( + _find_first_match(layer_name, targets) + or _find_first_match(module.__class__.__name__, targets, True) + or _match_fused_layer(layer_name, targets, fused_mapping)) if matched_target is None: raise ValueError( @@ -205,11 +169,19 @@ def _is_equal_or_regex_match(value: str, return False -def _match_fused_layer(layer_name: str, - target_layers: Iterable[str]) -> Optional[str]: +def _match_fused_layer( + layer_name: str, target_layers: Iterable[str], + fused_mapping: Mapping[str, List[str]]) -> Optional[str]: """ Match a fused layer name to its corresponding individual layer in - target_layers. + target_layers. Returns first value in fused_mapping which matches targets + + Implements an "all" matching strategy where a fused layer matches iff + "all" of its components match + + :param layer_name: layer name + :param target_layers: list of targets to match the layer against + :param fused_mapping: map from fused layer names to its components Examples: layer_name = "model.layers.0.self_attn.qkv_proj" @@ -217,27 +189,25 @@ def _match_fused_layer(layer_name: str, "model.layers.0.self_attn.k_proj", "model.layers.0.self_attn.v_proj"] """ - # Split into parent path and layer type - # e.g., "model.layers.0.self_attn" and "qkv_proj" - parent_path = ".".join(layer_name.split(".")[:-1]) - layer_type = layer_name.split(".")[-1] - - if layer_type not in FUSED_LAYER_NAME_MAPPING: + # find layer_name in mapping + fused = next((key for key in fused_mapping if layer_name.endswith(key)), + None) + if fused is None: return None - possible_layer_types = FUSED_LAYER_NAME_MAPPING[layer_type] - - # Look for a target layer that: - # 1. Has the same parent path - # 2. 
Ends with one of the possible individual layer types - for target in target_layers: - is_same_parent = parent_path in target - is_matching_type = any(type_suffix in target - for type_suffix in possible_layer_types) - - if is_same_parent and is_matching_type and all( - (f"{parent_path}.{type_suffix}" in target_layers) - for type_suffix in possible_layer_types): - return target + # expand path of unfused components + unfused_paths = [ + layer_name.replace(fused, unfused) for unfused in fused_mapping[fused] + ] - return None + # for each unfused component, find a match in targets + unfused_matches: List[Optional[str]] = [] + for unfused in unfused_paths: + for target in target_layers: + if _is_equal_or_regex_match(unfused, target): + unfused_matches.append(target) + break + else: + unfused_matches.append(None) + + return unfused_matches[0] if all(unfused_matches) else None diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 0451cf82b..ba123565a 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -18,8 +18,6 @@ from vllm.model_executor.layers.quantization.quark.schemes import ( QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8) from vllm.model_executor.layers.quantization.quark.utils import ( deep_compare, should_ignore_layer) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) from vllm.platforms import current_platform __all__ = ["QuarkLinearMethod"] @@ -58,7 +56,9 @@ class QuarkConfig(QuantizationConfig): # Check if the layer is skipped for quantization. 
exclude_layers = cast(List[str], self.quant_config.get("exclude")) - if should_ignore_layer(prefix, ignore=exclude_layers): + if should_ignore_layer(prefix, + ignore=exclude_layers, + fused_mapping=self.packed_modules_mapping): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) @@ -201,8 +201,8 @@ class QuarkConfig(QuantizationConfig): module: torch.nn.Module) -> Dict[str, Any]: proj_name = layer_name.split(".")[-1] - if proj_name in FUSED_LAYER_NAME_MAPPING: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in self.packed_modules_mapping: + shard_proj_names = self.packed_modules_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index afb1d9d63..17e0df021 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Any, Iterable, Optional - -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) +from types import MappingProxyType +from typing import Any, Iterable, List, Mapping, Optional def deep_compare(dict1: Any, dict2: Any) -> bool: @@ -20,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool: return dict1 == dict2 -def should_ignore_layer(layer_name: Optional[str], - ignore: Iterable[str]) -> bool: +def should_ignore_layer( + layer_name: Optional[str], + ignore: Iterable[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: if layer_name is None: return False @@ -33,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. 
So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. - if proj_name in FUSED_LAYER_NAME_MAPPING: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in fused_mapping: + shard_proj_names = fused_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 62484f62f..c7ce3a42c 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """This file is used for /tests and /benchmarks""" -from typing import List, Optional, Tuple +from types import MappingProxyType +from typing import List, Mapping, Optional, Tuple import numpy import torch @@ -12,14 +13,6 @@ from vllm.scalar_type import ScalarType, scalar_types SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128] SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] -# Note: this is a hack. We should update each model to register the -# stacked params and get it from there instead in a future PR. 
-# fused_name: List[shard_name] -FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - # Normalize the group_shape to the full extent for any dims that are -1 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int, @@ -178,14 +171,23 @@ def unpack_quantized_values_into_int32(w_q: torch.Tensor, return res.permute(inv_perm) -def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool: +def is_layer_skipped( + prefix: str, + ignored_layers: List[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: # prefix: model.layers.0.self_attn.q_proj # proj_name: q_proj proj_name = prefix.split(".")[-1] - if proj_name in FUSED_LAYER_NAME_MAPPING: + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. 
+ if proj_name in fused_mapping: shard_prefixes = [ prefix.replace(proj_name, shard_proj_name) - for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name] + for shard_proj_name in fused_mapping[proj_name] ] is_skipped = None diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 19e3bc6a2..2a2c2523b 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -43,6 +43,7 @@ from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) from vllm.model_executor.model_loader.utils import (ParamMapping, + configure_quant_config, get_model_architecture, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( @@ -113,6 +114,9 @@ def _initialize_model( model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) + if vllm_config.quant_config is not None: + configure_quant_config(vllm_config.quant_config, model_class) + signatures = inspect.signature(model_class.__init__) all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 7a82a695c..dc620d498 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -11,6 +11,8 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module from vllm.config import ModelConfig, ModelImpl from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models.adapters import (as_classification_model, as_embedding_model, @@ -138,3 +140,23 @@ class ParamMapping: if module_name.endswith(key): return key, 
value return None + + +def configure_quant_config(quant_config: QuantizationConfig, + model_class: Type[nn.Module]): + """ + Pass packed_modules_mapping by reference to quant_config so that + quant_config can properly match fused modules + + Note that model attributes are passed by reference to quant_config, + enabling them to be updated by model_class.__new__ (ex. chatglm, qwen) + """ + packed_mapping = getattr(model_class, "packed_modules_mapping", None) + if packed_mapping is not None: + # pass packed_modules_mapping by reference to quant_config + quant_config.packed_modules_mapping = packed_mapping + else: + logger.warning( + "The model class %s has not defined `packed_modules_mapping`, " + "this may lead to incorrect mapping of quantized or ignored " + "modules", model_class.__name__) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index b81a9e917..a31648675 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -265,12 +265,14 @@ class GLMAttention(nn.Module): self.total_num_kv_heads, bias=config.add_bias_linear or config.add_qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.query_key_value", ) self.dense = RowParallelLinear( self.total_num_heads * self.head_dim, config.hidden_size, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense", ) # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 @@ -327,6 +329,7 @@ class GLMMLP(nn.Module): self, config: ChatGLMConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() @@ -338,6 +341,7 @@ class GLMMLP(nn.Module): [config.ffn_hidden_size] * 2, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense_h_to_4h", ) self.activation_func = SiluAndMul() @@ -348,6 +352,7 @@ class GLMMLP(nn.Module): config.hidden_size, bias=config.add_bias_linear, quant_config=quant_config, + 
prefix=f"{prefix}.dense_4h_to_h", ) def forward(self, hidden_states): @@ -396,7 +401,7 @@ class GLMBlock(nn.Module): config.hidden_size, eps=config.layernorm_epsilon) # MLP - self.mlp = GLMMLP(config, quant_config) + self.mlp = GLMMLP(config, quant_config, prefix=f"{prefix}.mlp") def forward( self, @@ -507,7 +512,8 @@ class ChatGLMModel(nn.Module): self.embedding = VocabParallelEmbedding(config.padded_vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.embedding") self.num_layers = config.num_layers self.multi_query_group_num = config.multi_query_group_num @@ -766,6 +772,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsMultiModal): # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. + # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -777,9 +784,18 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, prefix: str = "", ) -> None: config = vllm_config.model_config.hf_config + # Initialize VL - if hasattr(config, "vision_config"): - return ChatGLMV(vllm_config=vllm_config, prefix=prefix) + if hasattr(config, "vision_config"): # noqa: SIM108 + instance_cls = ChatGLMV # Initialize LLM else: - return ChatGLM(vllm_config=vllm_config, prefix=prefix) \ No newline at end of file + instance_cls = ChatGLM + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/glm4_vision_encoder.py 
b/vllm/model_executor/models/glm4_vision_encoder.py index 4449eb8e8..2facd1353 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -74,11 +74,13 @@ class Attention(nn.Module): self.head_dim, config.num_heads, quant_config=quant_config, + prefix=f"{prefix}.query_key_value", ) self.dense = RowParallelLinear( config.hidden_size, config.hidden_size, quant_config=quant_config, + prefix=f"{prefix}.dense", ) self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, @@ -101,6 +103,7 @@ class MLP(nn.Module): self, config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): super().__init__() self.config = config @@ -109,11 +112,13 @@ class MLP(nn.Module): config.hidden_size, config.intermediate_size, quant_config=quant_config, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, quant_config=quant_config, + prefix=f"{prefix}.fc2", ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -137,7 +142,9 @@ class TransformerLayer(nn.Module): self.attention = Attention(config, quant_config=quant_config, prefix=f"{prefix}.attention") - self.mlp = MLP(config, quant_config=quant_config) + self.mlp = MLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.post_attention_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -164,7 +171,7 @@ class Transformer(nn.Module): self.layers = nn.ModuleList([ TransformerLayer(config, quant_config=quant_config, - prefix=f"{prefix}.layer.{layer_idx}") + prefix=f"{prefix}.layers.{layer_idx}") for layer_idx in range(config.num_hidden_layers) ]) @@ -181,6 +188,7 @@ class GLU(nn.Module): config, in_features, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): """ The original implementation is the same as: @@ -222,7 +230,8 @@ class GLU(nn.Module): self.linear_proj = ReplicatedLinear(in_features, config.hidden_size, bias=False, - 
quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") self.norm1 = nn.LayerNorm(config.hidden_size) self.act1 = nn.GELU() self.act2 = SiluAndMul() @@ -230,12 +239,15 @@ class GLU(nn.Module): self.merged_proj = MergedColumnParallelLinear( config.hidden_size, [config.ffn_hidden_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.merged_proj") - self.dense_4h_to_h = RowParallelLinear(config.ffn_hidden_size, - config.hidden_size, - bias=False, - quant_config=quant_config) + self.dense_4h_to_h = RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h") def forward(self, x): x, _ = self.linear_proj(x) @@ -262,7 +274,8 @@ class EVA2CLIPModel(nn.Module): prefix=f"{prefix}.transformer") self.linear_proj = GLU(config, in_features=config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 3d16d635b..20f3a3d19 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1473,6 +1473,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): """ # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. 
+ # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -1489,8 +1490,15 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): version = str(config.version).split(".") version = tuple([int(x) for x in version]) # Dispatch class based on version - instance_class = _SUPPORT_VERSION.get(version) - if instance_class is None: + instance_cls = _SUPPORT_VERSION.get(version) + if instance_cls is None: raise ValueError( "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") - return instance_class(vllm_config=vllm_config, prefix=prefix) + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 327fad0f5..897066124 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1135,6 +1135,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): """ # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. 
+ # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -1146,9 +1147,18 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): prefix: str = "", ) -> QWenBaseModel: config = vllm_config.model_config.hf_config + # Initialize VL - if hasattr(config, "visual"): - return QWenVL(vllm_config=vllm_config, prefix=prefix) + if hasattr(config, "visual"): # noqa: SIM108 + instance_cls = QWenVL # Initialize LLM else: - return QWenLLM(vllm_config=vllm_config, prefix=prefix) + instance_cls = QWenLLM + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) -- GitLab From 58b218d7ae91340a70a2a961d03f6e49315c2cfa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 5 Feb 2025 01:42:09 -0500 Subject: [PATCH 45/65] [Doc] Update PR Reminder with link to Developer Slack (#12748) --- .github/workflows/reminder_comment.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index df62539c0..27318c2fd 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -2,7 +2,6 @@ name: PR Reminder Comment Bot on: pull_request_target: types: [opened] - jobs: pr_reminder: runs-on: ubuntu-latest @@ -15,7 +14,12 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. 
Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + + '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + + 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. 
If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' + + 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + + 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + + '🚀' }) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -- GitLab From fcf2e3d7fcc9898b7a1b26bacea22753ab76f3a6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 5 Feb 2025 06:42:46 +0000 Subject: [PATCH 46/65] [Bugfix] Fix OpenVINO model runner (#12750) --- vllm/attention/backends/openvino.py | 4 ++++ vllm/model_executor/model_loader/openvino.py | 11 +++++------ vllm/worker/openvino_model_runner.py | 9 +++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index f58528dbf..9908620a3 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -140,3 +140,7 @@ class OpenVINOAttentionMetadata: # `model_executable`. multi_modal_placeholder_index_maps: Optional[Dict[ str, MultiModalPlaceholderMap.IndexMap]] + + # Enable/disable KV scales calculation. This is so that we can disable the + # calculation until after prefill and cuda graph capture. 
+ enable_kv_scales_calculation: bool diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 7bd531c56..fde200d57 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -13,7 +13,7 @@ from torch import nn import vllm.envs as envs from vllm.attention.backends.openvino import OpenVINOAttentionMetadata -from vllm.config import DeviceConfig, ModelConfig +from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import (LogitsProcessor, _prune_hidden_states) @@ -103,7 +103,6 @@ class OpenVINOCausalLM(nn.Module): self, ov_core: ov.Core, model_config: ModelConfig, - device_config: DeviceConfig, kv_cache_dtype: ov.Type, ) -> None: super().__init__() @@ -187,8 +186,7 @@ class OpenVINOCausalLM(nn.Module): def get_model( - model_config: ModelConfig, - device_config: DeviceConfig, + vllm_config: VllmConfig, kv_cache_dtype: ov.Type, **kwargs, ) -> torch.nn.Module: @@ -201,5 +199,6 @@ def get_model( "be added in the future. 
If this is important to you, " "please open an issue on github.") - return OpenVINOCausalLM(ov_core, model_config, device_config, - kv_cache_dtype) + with set_current_vllm_config(vllm_config): + return OpenVINOCausalLM(ov_core, vllm_config.model_config, + kv_cache_dtype) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 44442cddb..f7a5ab9de 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -54,15 +54,13 @@ class OpenVINOModelRunner(ModelRunnerBase): ): self.ov_core = ov_core ModelRunnerBase.__init__(self, vllm_config=vllm_config) - cache_config = self.cache_config - model_config = self.model_config self.is_driver_worker = is_driver_worker self.device = self.device_config.device self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size + self.sliding_window = self.model_config.get_sliding_window() + self.block_size = self.cache_config.block_size self.attn_backend = get_attn_backend( self.model_config.get_head_size(), @@ -81,8 +79,7 @@ class OpenVINOModelRunner(ModelRunnerBase): self.model: nn.Module # Set after init_Model def load_model(self) -> None: - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, + self.model = get_model(vllm_config=self.vllm_config, kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) -- GitLab From 3d09e592a860982acef0edef858078d28d393e84 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 4 Feb 2025 22:43:02 -0800 Subject: [PATCH 47/65] [V1][Misc] Shorten `FinishReason` enum and use constant strings (#12760) --- vllm/v1/engine/__init__.py | 12 +++++++++--- vllm/v1/engine/detokenizer.py | 7 +++---- vllm/v1/metrics/loggers.py | 6 +++--- vllm/v1/metrics/stats.py | 7 +++---- vllm/v1/request.py | 14 +++++++------- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py 
index 6bd548bdc..d5933cac5 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -14,11 +14,17 @@ if TYPE_CHECKING: from vllm.multimodal.inputs import PlaceholderRange from vllm.sampling_params import SamplingParams +# These are possible values of RequestOutput.finish_reason, +# so form part of the external API. +FINISH_REASON_STRINGS = ("stop", "length", "abort") -class RequestFinishedReason(enum.IntEnum): + +class FinishReason(enum.IntEnum): """ Reason a request finished - stop, length, or abort. + Int rather than Str for more compact serialization. + stop - a stop string was emitted length - max_tokens was consumed, or max_model_len was reached abort - aborted for another reason @@ -29,7 +35,7 @@ class RequestFinishedReason(enum.IntEnum): ABORT = 2 def __str__(self): - return self.name.lower() + return FINISH_REASON_STRINGS[self.value] @dataclass @@ -62,7 +68,7 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] finished: bool - finish_reason: Optional[RequestFinishedReason] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2bce23e68..861fcb012 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,8 +8,7 @@ from vllm.logger import init_logger from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest, - RequestFinishedReason) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason logger = init_logger(__name__) @@ -19,7 +18,7 @@ class DetokenizerOutput: output_text: str token_ids: List[int] finished: bool - finish_reason: Optional[RequestFinishedReason] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None @@ -148,7 +147,7 @@ 
class IncrementalDetokenizer: stop_str, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] - finish_reason = RequestFinishedReason.STOP + finish_reason = FinishReason.STOP stop_reason = stop_str # TODO: handle stop_token_ids here too? diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b62351a8f..eb1acf584 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,7 +9,7 @@ import prometheus_client from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.v1.engine import RequestFinishedReason +from vllm.v1.engine import FinishReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -117,13 +117,13 @@ class PrometheusStatLogger(StatLoggerBase): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) - self.counter_request_success: Dict[RequestFinishedReason, + self.counter_request_success: Dict[FinishReason, prometheus_client.Counter] = {} counter_request_success_base = prometheus_client.Counter( name="vllm:request_success_total", documentation="Count of successfully processed requests.", labelnames=labelnames + ["finished_reason"]) - for reason in RequestFinishedReason: + for reason in FinishReason: self.counter_request_success[ reason] = counter_request_success_base.labels(*(labelvalues + [str(reason)])) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 36c95e07d..e3f1efcc9 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List if TYPE_CHECKING: from vllm.outputs import RequestOutput - from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason + from vllm.v1.engine import EngineCoreOutput, FinishReason @dataclass @@ -32,7 +32,7 @@ class RequestStateStats: class FinishedRequestStats: """Stats associated with a finished request.""" - finish_reason: "RequestFinishedReason" 
+ finish_reason: "FinishReason" num_prompt_tokens: int = 0 num_generation_tokens: int = 0 @@ -74,8 +74,7 @@ class IterationStats: request_state_stats.num_generation_tokens += num_new_generation_tokens request_state_stats.last_token_time = now - def update_from_finished_request(self, - finish_reason: "RequestFinishedReason", + def update_from_finished_request(self, finish_reason: "FinishReason", request_output: "RequestOutput", request_state_stats: RequestStateStats): self.finished_requests.append( diff --git a/vllm/v1/request.py b/vllm/v1/request.py index eb9bf99b4..89b39ea61 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason +from vllm.v1.engine import EngineCoreRequest, FinishReason from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -109,7 +109,7 @@ class Request: def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) - def get_finished_reason(self) -> Union[RequestFinishedReason, None]: + def get_finished_reason(self) -> Union[FinishReason, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: @@ -150,7 +150,7 @@ class RequestStatus(enum.IntEnum): @staticmethod def get_finished_reason( - status: "RequestStatus") -> Union[RequestFinishedReason, None]: + status: "RequestStatus") -> Union[FinishReason, None]: return _FINISHED_REASON_MAP.get(status) @@ -159,8 +159,8 @@ class RequestStatus(enum.IntEnum): # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. 
_FINISHED_REASON_MAP = { - RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP, - RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH, - RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT, - RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH, + RequestStatus.FINISHED_STOPPED: FinishReason.STOP, + RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH, + RequestStatus.FINISHED_ABORTED: FinishReason.ABORT, + RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH, } -- GitLab From c53dc466b1b802d45d0b61cce36908334ea7a23e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 5 Feb 2025 01:43:11 -0500 Subject: [PATCH 48/65] [Doc] Remove performance warning for auto_awq.md (#12743) --- docs/source/features/quantization/auto_awq.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 30735b116..fa0bebeb8 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -2,12 +2,6 @@ # AutoAWQ -:::{warning} -Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better -accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency -inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. -::: - To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. The main benefits are lower latency and memory usage. 
-- GitLab From 022bcc701a948f96e68af678eee686837f393d07 Mon Sep 17 00:00:00 2001 From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:41:02 +0530 Subject: [PATCH 49/65] [Bugfix] Fix 'ModuleNotFoundError: No module named 'intel_extension_for_pytorch'' for --tensor-parallel-size more than 1 (#12546) --- vllm/distributed/parallel_state.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c5c5dfbba..321902d11 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -329,9 +329,17 @@ class GroupCoordinator: return input_ if input_.is_cpu: - import intel_extension_for_pytorch as ipex - ipex.distributed.all_reduce(input_, group=self.device_group) - return input_ + try: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) + return input_ + except ImportError: + """ + Intel IPEX not found. 
Falling back to PyTorch native + all_reduce for CPU + """ + torch.distributed.all_reduce(input_, group=self.device_group) + return input_ if self.tpu_communicator is not None and \ not self.tpu_communicator.disabled: -- GitLab From bc1bdecebf76cca0dfafe4924d529b30c8a24795 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 6 Feb 2025 02:03:19 +0800 Subject: [PATCH 50/65] [core][distributed] exact ray placement control (#12732) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 2 + examples/offline_inference/ray_placement.py | 121 ++++++++++++++++++++ vllm/envs.py | 14 +++ vllm/executor/ray_distributed_executor.py | 36 +++--- vllm/platforms/cuda.py | 8 ++ vllm/platforms/interface.py | 5 + 6 files changed, 173 insertions(+), 13 deletions(-) create mode 100644 examples/offline_inference/ray_placement.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a847a68a6..7ef40564c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -128,6 +128,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile - examples/offline_inference/rlhf.py + - examples/offline_inference/ray_placement.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py @@ -136,6 +137,7 @@ steps: # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - python3 ../examples/offline_inference/rlhf.py + - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py - label: Metrics, Tracing Test # 10min num_gpus: 2 diff --git a/examples/offline_inference/ray_placement.py b/examples/offline_inference/ray_placement.py new file mode 100644 index 000000000..cd801a3c0 --- /dev/null +++ b/examples/offline_inference/ray_placement.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +a simple demonstration to show how to control +the placement of the vLLM workers with Ray. 
+The key is to set VLLM_RAY_PER_WORKER_GPUS and +VLLM_RAY_BUNDLE_INDICES properly. +""" +import os + +import ray +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +from vllm import LLM +from vllm.worker.worker import Worker + + +class MyWorker(Worker): + + def report_device_id(self) -> str: + from vllm.platforms import current_platform + return current_platform.get_device_uuid(self.device.index) + + +class MyLLM(LLM): + + def __init__(self, *args, bundle_indices: list, **kwargs): + # a hack to make the script work. + # stop ray from manipulating CUDA_VISIBLE_DEVICES + # at the top-level + del os.environ["CUDA_VISIBLE_DEVICES"] + # every worker will use 0.4 GPU, so that we can schedule + # 2 instances on the same GPUs. + os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" + os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join( + map(str, bundle_indices)) + print(f"creating LLM with bundle_indices={bundle_indices}") + super().__init__(*args, **kwargs) + + +class RayTrainingActor: + + def report_device_id(self) -> str: + # the argument for get_device_uuid is the index + # of the GPU in the visible devices. + # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs + from vllm.platforms import current_platform + return current_platform.get_device_uuid(0) + + +# ray manages 4 GPUs +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" +ray.init() + +# we want to co-locate vLLM instance and the training actor +# on the same set of GPUs. 
+# the placement plan is as follows: +# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2) +# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2) + +pg = placement_group([{"GPU": 1, "CPU": 0}] * 4) +ray.get(pg.ready()) +print(f"placement group has bundles {pg.bundle_specs=}") + +training_actors = [] +training_actor_device_ids = [] +inference_engines = [] +inference_engine_device_ids = [] + +for bundle_index in [0, 1, 2, 3]: + training_actor = ray.remote( + num_cpus=0, + num_gpus=0.4, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_index, + ), + )(RayTrainingActor).remote() + training_actors.append(training_actor) + device_id = ray.get(training_actor.report_device_id.remote()) + print(f"training actor {bundle_index} is on {device_id}") + training_actor_device_ids.append(device_id) + +for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]): + # IMPORTANT: when creating vLLM instances, we need to + # make sure there are no GPU activities on the target GPUs, + # otherwise, they will interfere with the vLLM memory profiling, + # and cause unexpected behaviors. + llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_capture_child_tasks=True, + ), + )(MyLLM).remote( + model="facebook/opt-125m", + enforce_eager=True, + worker_cls=MyWorker, + tensor_parallel_size=2, + distributed_executor_backend="ray", + gpu_memory_utilization=0.4, + bundle_indices=bundle_indices, + ) + inference_engines.append(llm) + # don't call any method on the inference engine here, + # otherwise it will block until the vLLM instance is created. 
+ +for i, llm in enumerate(inference_engines): + inference_engine_device_ids.append( + ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))) + print(f"inference engine {i} is on {inference_engine_device_ids[-1]}") + +# check the placement +# the first two training actors should be +# on the same GPUs as the first inference engine +assert training_actor_device_ids[:2] == inference_engine_device_ids[0] +# the last two training actors should be +# on the same GPUs as the second inference engine +assert training_actor_device_ids[2:] == inference_engine_device_ids[1] diff --git a/vllm/envs.py b/vllm/envs.py index bb419dacb..745b068b7 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -85,6 +85,8 @@ if TYPE_CHECKING: VLLM_MLA_DISABLE_REQUANTIZATION: bool = False VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False + VLLM_RAY_PER_WORKER_GPUS: float = 1.0 + VLLM_RAY_BUNDLE_INDICES: str = "" def get_default_cache_root(): @@ -550,6 +552,18 @@ environment_variables: Dict[str, Callable[[], Any]] = { lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) ), + # Number of GPUs per worker in Ray, if it is set to be a fraction, + # it allows ray to schedule multiple actors on a single GPU, + # so that users can colocate other actors on the same GPUs as vLLM. + "VLLM_RAY_PER_WORKER_GPUS": + lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")), + + # Bundle indices for Ray, if it is set, it can control precisely + # which indices are used for the Ray bundle, for every worker. + # Format: comma-separated list of integers, e.g. "0,1,2,3" + "VLLM_RAY_BUNDLE_INDICES": + lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), + # When on a Nvidia GPU aligns single entries (within a page) so they are 256 # byte aligned for better performance, this increases the memory usage of # the cache. 
Currently this only affects MLA that results in non-256 diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 80e7a1c40..6a25a4d50 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -129,13 +129,7 @@ class RayDistributedExecutor(DistributedExecutorBase): def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 + num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. @@ -155,12 +149,29 @@ class RayDistributedExecutor(DistributedExecutorBase): logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. - driver_ip = get_ip() - rank = 0 + bundle_indices: List[int] + if envs.VLLM_RAY_BUNDLE_INDICES: + # Use the bundle indices specified by the user. + bundle_indices = list( + map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(","))) + assert len(bundle_indices) == self.parallel_config.world_size, \ + ("VLLM_RAY_BUNDLE_INDICES must have the same size" + f" as the world size, but got {bundle_indices=} " + f"and {self.parallel_config.world_size=}") + assert len(set(bundle_indices)) == len(bundle_indices), \ + ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values," + f" but got {bundle_indices=}") + else: + # use the first N bundles that have GPU resources. 
+ bundle_indices = [] + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if bundle.get(current_platform.ray_device_key, 0): + bundle_indices.append(bundle_id) + bundle_indices = bundle_indices[:self.parallel_config.world_size] + worker_metadata: List[RayWorkerMetaData] = [] - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get(current_platform.ray_device_key, 0): - continue + driver_ip = get_ip() + for rank, bundle_id in enumerate(bundle_indices): scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, placement_group_capture_child_tasks=True, @@ -187,7 +198,6 @@ class RayDistributedExecutor(DistributedExecutorBase): rpc_rank=rank) worker_metadata.append( RayWorkerMetaData(worker=worker, created_rank=rank)) - rank += 1 worker_ips = ray.get([ each.worker.get_node_ip.remote() # type: ignore[attr-defined] diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b49852a72..991d55ac8 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -275,6 +275,14 @@ class NvmlCudaPlatform(CudaPlatformBase): physical_device_id = device_id_to_physical_device_id(device_id) return cls._get_physical_device_name(physical_device_id) + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_uuid(cls, device_id: int = 0) -> str: + physical_device_id = device_id_to_physical_device_id(device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return pynvml.nvmlDeviceGetUUID(handle) + @classmethod @lru_cache(maxsize=8) @with_nvml_context diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index dc6545c93..211e288b1 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -183,6 +183,11 @@ class Platform: """Get the name of a device.""" raise NotImplementedError + @classmethod + def get_device_uuid(cls, device_id: int = 0) -> str: + """Get the uuid of a device, e.g. 
the PCI bus ID.""" + raise NotImplementedError + @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: """Get the total memory of a device in bytes.""" -- GitLab From 4c3aac51e14214880a3205b45c26aa16535992b5 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 6 Feb 2025 05:24:26 +0800 Subject: [PATCH 51/65] Merging PR #12536 Merged via CLI script --- vllm/attention/layer.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 19ee89630..e4df7ffc5 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -156,9 +156,13 @@ class Attention(nn.Module): kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - if self.calculate_kv_scales and \ - attn_metadata.enable_kv_scales_calculation: - self.calc_kv_scales(key, value) + # NOTE: please avoid accessing `kv_cache` and `attn_metadata` arguments + # directly, use `self.kv_cache` and + # `get_forward_context().attn_metadata` instead. 
+ if self.calculate_kv_scales: + ctx_attn_metadata = get_forward_context().attn_metadata + if ctx_attn_metadata.enable_kv_scales_calculation: + self.calc_kv_scales(key, value) if self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) @@ -172,15 +176,27 @@ class Attention(nn.Module): if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) if self.use_direct_call: - unified_attention_with_output(query, key, value, output, - self.layer_name) + forward_context: ForwardContext = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + self.impl.forward(self, + query, + key, + value, + self_kv_cache, + ctx_attn_metadata, + output=output) else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: if self.use_direct_call: - return unified_attention(query, key, value, self.layer_name) + forward_context = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + return self.impl.forward(self, query, key, value, + self_kv_cache, ctx_attn_metadata) else: return torch.ops.vllm.unified_attention( query, key, value, self.layer_name) -- GitLab From af8486de49a200a980f71fddc6d1eb4d8f9f1bca Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Thu, 6 Feb 2025 02:59:45 +0530 Subject: [PATCH 52/65] [Hardware][Intel-Gaudi] Enable FusedSDPA support for Intel Gaudi (HPU) --- vllm/attention/backends/hpu_attn.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 1518e518e..1ad5e6e8e 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -10,7 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch import vllm_hpu_extension.ops as ops 
-from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache +from vllm_hpu_extension.utils import (Matmul, ModuleFusedSDPA, Softmax, + VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, @@ -137,9 +138,17 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] + self.fused_scaled_dot_product_attention = None if self.prefill_usefusedsdpa: assert alibi_slopes is None, \ 'Prefill with FusedSDPA not supported with alibi slopes!' + try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA + self.fused_scaled_dot_product_attention = ModuleFusedSDPA( + FusedSDPA) + except ImportError: + logger().warning("Could not import HPU FusedSDPA kernel. " + "vLLM will use native implementation.") suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() if head_size not in suppored_head_sizes: @@ -227,6 +236,7 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): matmul_qk_op=self.matmul_qk, softmax_op=self.softmax, matmul_av_op=self.matmul_av, + fsdpa_op=self.fused_scaled_dot_product_attention, ) output = out.reshape(batch_size, seq_len, hidden_size) else: -- GitLab From 3b2005e1db79efe2ea4587035eb2e7ced6e258cb Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Wed, 5 Feb 2025 15:30:43 -0600 Subject: [PATCH 53/65] Add: Support for Sparse24Bitmask Compressed Models --- .../SparseLlama3.1_2of4_fp8_compressed.yaml | 11 + tests/quantization/test_compressed_tensors.py | 332 +++++++++++++++--- .../compressed_tensors/compressed_tensors.py | 34 +- .../schemes/compressed_tensors_24.py | 238 ++++++++++--- 4 files changed, 503 insertions(+), 112 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml diff --git a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml 
new file mode 100644 index 000000000..2928d75ce --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 +model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.6353 + - name: "exact_match,flexible-extract" + value: 0.637 +limit: null +num_fewshot: null diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 7e2e6f6ed..0655f2b38 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -3,6 +3,7 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. """ + from typing import Optional import pytest @@ -22,12 +23,30 @@ from vllm.platforms import current_platform @pytest.mark.parametrize( "model_args", - [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor", - QuantizationType.INT, 2560, True), - ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel", - QuantizationType.INT, 2560, True), - ("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor", - QuantizationType.INT, 2560, False)]) + [ + ( + "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", + "tensor", + QuantizationType.INT, + 2560, + True, + ), + ( + "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", + "channel", + QuantizationType.INT, + 2560, + True, + ), + ( + "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", + "tensor", + QuantizationType.INT, + 2560, + False, + ), + ], +) def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: @@ -85,21 +104,31 @@ def 
test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): assert output -@pytest.mark.parametrize("model_path", [ - "neuralmagic/Llama-3.2-1B-quantized.w8a8", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" -]) +@pytest.mark.parametrize( + "model_path", + [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + ], +) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner, - example_prompts, model_path, - max_tokens, num_logprobs): +def test_compressed_tensors_w8a8_logprobs( + hf_runner, + vllm_runner, + example_prompts, + model_path, + max_tokens, + num_logprobs, +): dtype = "bfloat16" # skip language translation prompt for the static per tensor asym model - if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym": # noqa: E501 + if (model_path == + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" + ): # noqa: E501 example_prompts = example_prompts[0:-1] with hf_runner(model_path, dtype=dtype) as hf_model: @@ -125,13 +154,21 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): assert output -@pytest.mark.parametrize("model_args", [ - ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), - ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"), - ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), - ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", - "channel"), -]) +@pytest.mark.parametrize( + "model_args", + [ + ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), + 
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"), + ( + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", + "channel", + ), + ( + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", + "channel", + ), + ], +) def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: @@ -156,9 +193,12 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): @pytest.mark.parametrize( "wNa16_args", - [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8), - ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), - ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)]) + [ + ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8), + ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), + ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4), + ], +) def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: @@ -218,7 +258,8 @@ def test_compressed_tensors_fp8(vllm_runner): CompressedTensorsLinearMethod) assert isinstance( qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8), + ) assert qkv_proj.input_scale.dtype is torch.float32 @@ -241,9 +282,14 @@ def test_compressed_tensors_kv_cache(vllm_runner): assert output -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +def _test_2of4_quant_models(qkv_proj, + weight_strategy, + input_strategy, + format="dense"): assert isinstance(qkv_proj.quant_method, 
CompressedTensorsLinearMethod) assert isinstance(qkv_proj.scheme, CompressedTensors24) @@ -252,22 +298,39 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): assert qkv_proj.scheme.quantized assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").format == format assert sparsity_map.get("Linear").sparsity_structure == "2:4" -@pytest.mark.skipif(not current_platform.has_device_capability(90), - reason="Sparse FP8 is not yet supported on this GPU type.") -@pytest.mark.parametrize("args_2of4", [ - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel", - "token"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", - "channel", "tensor"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor", - "tensor"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", - "tensor", "token"), -]) +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", + "channel", + "token", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", + "channel", + "tensor", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", + "tensor", + "tensor", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", + "tensor", + "token", + ), + ], +) def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: @@ -286,16 +349,134 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): assert output -@pytest.mark.skipif(not 
sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -@pytest.mark.parametrize("args_2of4", [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", - "channel", "token"), - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor", - "tensor"), - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", - "tensor", "token"), -]) +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", + "channel", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", + "tensor", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", + "tensor", + "tensor", + ), + ], +) +def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4): + model, weight_strategy, input_strategy = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models( + qkv_proj, + weight_strategy, + input_strategy, + format="sparse-24-bitmask", + ) + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="cutlass is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM", + "channel", + "token", + ), + ( + 
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM", + "channel", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM", + "tensor", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM", + "tensor", + "tensor", + ), + ], +) +def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4): + model, weight_strategy, input_strategy = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models( + qkv_proj, + weight_strategy, + input_strategy, + format="sparse-24-bitmask", + ) + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", + "tensor", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", + "tensor", + "token", + ), + ], +) def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: @@ -317,10 +498,12 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") @pytest.mark.skipif( not sparse_cutlass_supported(), - reason="2of4 Sparse is not yet supported on this GPU type.") + reason="2of4 Sparse is not yet supported on this GPU type.", +) @pytest.mark.parametrize( "args_2of4", - 
[("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) + [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")], +) def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: @@ -337,7 +520,9 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): assert qkv_proj.scheme.input_quant is None assert not qkv_proj.scheme.quantized assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + sparsity_map = ( + qkv_proj.quant_method.quantization_config.sparsity_scheme_map + ) # noqa: E501 assert sparsity_map.get("Linear").format == "dense" assert sparsity_map.get("Linear").sparsity_structure == "2:4" @@ -346,3 +531,38 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Cutlass is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]) +def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): + model = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = ( + qkv_proj.quant_method.quantization_config.sparsity_scheme_map + ) # noqa: E501 + assert sparsity_map.get("Linear").format == "sparse-24-bitmask" + assert sparsity_map.get("Linear").sparsity_structure == 
"2:4" + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0e3258e4a..6ee3e9362 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -417,15 +417,22 @@ class CompressedTensorsConfig(QuantizationConfig): return None # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel - scheme = CompressedTensors24(quantized=weight_quant is not None - or input_quant is not None, - weight_quant=weight_quant, - input_quant=input_quant) + model_compression_config = (None if sparsity_scheme is None + or sparsity_scheme.format == "dense" + else self.config) + + scheme = CompressedTensors24( + quantized=weight_quant is not None or input_quant is not None, + weight_quant=weight_quant, + input_quant=input_quant, + model_compression_config=model_compression_config, + ) elif weight_quant is None: logger.warning_once("Acceleration for non-quantized schemes is " "not supported by Compressed Tensors. 
" "Falling back to UnquantizedLinearMethod") return None + else: # Find the quant_scheme scheme = self._get_scheme_from_parts( # type: ignore @@ -475,10 +482,21 @@ class CompressedTensorsConfig(QuantizationConfig): :return: True if the layer is supported by the Cutlass 2:4 Kernel False otherwise """ - is_valid_sparsity = (sparsity_scheme is not None - and sparsity_scheme.sparsity_structure - == SparsityStructure.TWO_FOUR.value - and sparsity_scheme.format == "dense") + if sparsity_scheme is None: + return False + + is_valid_sparsity_structure: bool = ( + sparsity_scheme.sparsity_structure == + SparsityStructure.TWO_FOUR.value) + + valid_compressors = { + CompressionFormat.dense.value, + CompressionFormat.sparse_24_bitmask.value + } + + is_valid_sparsity = (is_valid_sparsity_structure + and sparsity_scheme.format in valid_compressors) + if not is_valid_sparsity: return False diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index 84f924b23..0fb8dfa96 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,13 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple import torch +from compressed_tensors import CompressionFormat, ModelCompressor from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy, QuantizationType) +from compressed_tensors.utils import combine_shards from vllm import _custom_ops as ops +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) from 
vllm.model_executor.layers.quantization.utils.w8a8_utils import ( @@ -22,26 +26,39 @@ __all__ = ["CompressedTensors24"] class CompressedTensors24(CompressedTensorsScheme): - def __init__(self, - quantized: bool = False, - weight_quant: Optional[QuantizationArgs] = None, - input_quant: Optional[QuantizationArgs] = None): - + def __init__( + self, + quantized: bool = False, + weight_quant: Optional[QuantizationArgs] = None, + input_quant: Optional[QuantizationArgs] = None, + model_compression_config: Optional[Dict[str, Any]] = None, + ): self.quantized = quantized self.weight_quant = weight_quant self.input_quant = input_quant + self.model_compressor = ( + ModelCompressor.from_compression_config(model_compression_config) + if model_compression_config is not None else None) + self.do_sparse_decompress = ( + self.model_compressor is not None + and self.model_compressor.sparsity_config.format + == CompressionFormat.sparse_24_bitmask.value) @classmethod def get_min_capability(cls) -> int: # Only cutlass 3.x kernels are implemented so far return 90 - def create_weights(self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: List[int], - input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): - + def create_weights( + self, + layer: torch.nn.Module, + input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): if not sparse_cutlass_supported(): raise ValueError( "Sparse CUTLASS not supported. 
vLLM must be built with " @@ -49,16 +66,56 @@ class CompressedTensors24(CompressedTensorsScheme): self.output_dtype = params_dtype layer.logical_widths = output_partition_sizes + layer.input_size = input_size + layer.input_size_per_partition = input_size_per_partition self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype) # parameter to store uncompressed weight - weight = ModelWeightParameter(data=torch.empty( - sum(output_partition_sizes), - input_size_per_partition, - dtype=self.weights_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + if self.do_sparse_decompress: + assert all(partition_size % 8 == 0 + for partition_size in output_partition_sizes + ), "All partitions must be divisible by 8 for " + "2:4 sparse compressed models" + + shape = BasevLLMParameter( + data=torch.empty(2, 1, dtype=torch.int64), + weight_loader=weight_loader, + ) + compressed_weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 2, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + bitmask = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 8, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("shape", shape) + layer.register_parameter("compressed", compressed_weight) + layer.register_parameter("bitmask", bitmask) # Check if quantized, not just 2:4 Sparse if self.quantized: @@ -68,14 +125,16 @@ class CompressedTensors24(CompressedTensorsScheme): data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), output_dim=0, - weight_loader=weight_loader) + weight_loader=weight_loader, + ) else: assert 
(self.weight_quant and self.weight_quant.strategy == QuantizationStrategy.TENSOR.value) weight_scale = PerTensorScaleParameter( data=torch.empty(len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) + weight_loader=weight_loader, + ) layer.register_parameter("weight_scale", weight_scale) @@ -84,9 +143,10 @@ class CompressedTensors24(CompressedTensorsScheme): # register input quant scale assert (self.input_quant.strategy == QuantizationStrategy.TENSOR.value) - input_scale = BasevLLMParameter(data=torch.empty( - 1, dtype=torch.float32), - weight_loader=weight_loader) + input_scale = BasevLLMParameter( + data=torch.empty(1, dtype=torch.float32), + weight_loader=weight_loader, + ) layer.register_parameter("input_scale", input_scale) @@ -107,13 +167,25 @@ class CompressedTensors24(CompressedTensorsScheme): """ Compress weights after loading. Store compressed weight and meta tensor - + :post-condition: layer.w_compressed and layer.meta are set to the compressed weight and meta tensor in the format expected by the Cutlass kernels :param layer: The layer with the weights to be processed - + """ + if self.do_sparse_decompress: + layer.weight.data = self._decompress_bitmask_compressed_weight( + compressed=layer.compressed, + bitmask=layer.bitmask, + layer=layer, + ) + + # compressed and bitmask tensors + # are no longer needed after decompression + del layer.compressed + del layer.bitmask + # torch.compile workaround if hasattr(layer, "input_scale"): layer.input_scale = torch.nn.Parameter(layer.input_scale.data, @@ -121,10 +193,13 @@ class CompressedTensors24(CompressedTensorsScheme): if self.weight_quant: if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value: - layer.weight_scale = torch.nn.Parameter(convert_to_channelwise( - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths), - requires_grad=False) + layer.weight_scale = torch.nn.Parameter( + convert_to_channelwise( + weight_scale=layer.weight_scale, + 
logical_widths=layer.logical_widths, + ), + requires_grad=False, + ) else: # torch.compile workaround layer.weight_scale = torch.nn.Parameter( @@ -134,20 +209,22 @@ class CompressedTensors24(CompressedTensorsScheme): layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False) layer.meta = torch.nn.Parameter(meta, requires_grad=False) - def apply_weights(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ - Returns the output tensor for the layer with 2:4 + Returns the output tensor for the layer with 2:4 sparse compressed weights, given the input tensor and bias - :param layer: The layer with 2:4 sparse compressed + :param layer: The layer with 2:4 sparse compressed weights to be used for the computation :param x: The input tensor to the layer :param bias: The bias to be added to the output tensor - :return: The output tensor of the layer + :return: The output tensor of the layer """ if self.quantized: scale = None @@ -171,13 +248,15 @@ class CompressedTensors24(CompressedTensorsScheme): input_scale = layer.input_scale q_input = x - out = ops.cutlass_scaled_sparse_mm(a=q_input, - bt_nzs=layer.weight, - bt_meta=layer.meta, - scale_a=input_scale, - scale_b=layer.weight_scale, - out_dtype=self.output_dtype, - bias=bias) + out = ops.cutlass_scaled_sparse_mm( + a=q_input, + bt_nzs=layer.weight, + bt_meta=layer.meta, + scale_a=input_scale, + scale_b=layer.weight_scale, + out_dtype=self.output_dtype, + bias=bias, + ) assert out.is_contiguous() return out @@ -203,8 +282,71 @@ class CompressedTensors24(CompressedTensorsScheme): raise ValueError("Quantization type not supported by Cutlass") + def _decompress_bitmask_compressed_weight( + self, + compressed: torch.Tensor, + bitmask: torch.Tensor, + layer: torch.nn.Module, + ) -> torch.Tensor: + """ + Decompress a compressed 2:4 
sparse weight tensor using the bitmask and + return the result. + + This function also supports sharded decompression. + + :param compressed: The 2:4 sparse weight tensor compressed using the + sparse-24-bitmask compressor. This is different from + `cutlass_sparse_compress` which uses a different scheme (2 bits for + every nonzero element that represent the coordinate within the block + of 4). The bitmask compression here uses a bitmask to indicate the + positions of non-zero elements. + :param bitmask: The 2:4 bitmask associated with the compressed weights, + representing the positions of non-zero elements in the compressed + tensor. + :param layer: The layer whose weights need to be processed after + loading. + :return: The decompressed 2:4 sparse weight tensor. + """ -def check_24(tensor): - new_tensor = tensor.view(-1, 4) - zero_counts = (new_tensor == 0).sum(dim=1) - return (zero_counts >= 2).all().item() + sparsity_compressor = self.model_compressor.sparsity_compressor + + def _process_split( + bitmask_compressed_weight: torch.Tensor, + shape, + bitmask: torch.Tensor, + ) -> torch.Tensor: + weight_data = dict( + compressed=bitmask_compressed_weight, + shape=shape, + bitmask=bitmask, + ) + return sparsity_compressor.decompress_weight(weight_data) + + split_weights: List[torch.Tensor] = [] + split_bitmask: List[torch.Tensor] = [] + split_shape: List[Tuple[int, int]] = [] + + if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)): + split_weights = torch.split(compressed, layer.logical_widths) + split_bitmask = torch.split(bitmask, layer.logical_widths) + split_shape = [(out, layer.input_size_per_partition) + for out in layer.logical_widths] + + if split_weights: + decompressed_shards = [ + _process_split(compressed_weight, shape, bitmask) + for compressed_weight, shape, bitmask in zip( + split_weights, split_shape, split_bitmask) + ] + decompressed = combine_shards(decompressed_shards) + else: + decompressed = 
sparsity_compressor.decompress_weight( + dict( + compressed=compressed, + shape=( + layer.logical_widths[0], + layer.input_size_per_partition, + ), + bitmask=bitmask, + )) + return decompressed -- GitLab From a4ce74c14a469b178c446a34ce48f158909a8e74 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 6 Feb 2025 05:30:46 +0800 Subject: [PATCH 54/65] [VLM] Use shared field to pass token ids to model --- vllm/model_executor/models/internvl.py | 6 +- vllm/multimodal/inputs.py | 275 +++++++++++++++++++++---- 2 files changed, 235 insertions(+), 46 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 08fc659ab..380eb40d9 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -564,8 +564,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): # Since there may be extra tokens in the feature placeholders, # we need to pass the image token ID to the model to select the # tokens to merge from the vision encoder outputs - processed_outputs["image_token_id"] = [image_token_id - ] * len(image_data) + processed_outputs["image_token_id"] = torch.tensor(image_token_id) return processed_outputs @@ -575,13 +574,14 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + num_images = len(image_num_patches) return dict( pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( "image", image_num_patches), image_num_patches=MultiModalFieldConfig.batched("image"), image_embeds=MultiModalFieldConfig.batched("image"), - image_token_id=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), ) def _get_prompt_replacements( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 8e4af7f88..2f2535f36 100644 --- a/vllm/multimodal/inputs.py +++ 
b/vllm/multimodal/inputs.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass +from functools import partial from itertools import accumulate from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, Union, cast, final) @@ -164,51 +165,112 @@ A dictionary containing nested tensors which have been batched via @dataclass(frozen=True) class MultiModalFieldElem: - """Contains metadata and data of an item in :class:`MultiModalKwargs`.""" - field: "BaseMultiModalField" + """ + Represents a keyword argument corresponding to a multi-modal item + in :class:`MultiModalKwargs`. + """ + + modality: str + """ + The modality of the corresponding multi-modal item. + Each multi-modal item can consist of multiple keyword arguments. + """ + + key: str + """ + The key of this field in :class:`MultiModalKwargs`, + i.e. the name of the keyword argument to be passed to the model. + """ + data: NestedTensors + """ + The tensor data of this field in :class:`MultiModalKwargs`, + i.e. the value of the keyword argument to be passed to the model. + """ + + field: "BaseMultiModalField" + """ + Defines how to combine the tensor data of this field with others + in order to batch multi-modal items together for model inference. 
+ """ def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - return (self.field == other.field - and nested_tensors_equal(self.data, other.data)) + return ((self.modality, self.key) == (other.modality, other.key) + and nested_tensors_equal(self.data, other.data) + and type(self.field) == type(other.field)) # noqa: E721 @dataclass(frozen=True) class BaseMultiModalField(ABC): - """Abstract base class for a field in :class:`MultiModalKwargs`.""" - key: str - modality: str + """ + Defines how to interpret tensor data belonging to a keyword argument in + :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa. + """ + + def _field_factory(self, *, modality: str, key: str): + f = partial( + MultiModalFieldElem, + modality=modality, + key=key, + field=self, + ) + + # Allow passing data as positional argument + def factory(data: NestedTensors) -> MultiModalFieldElem: + return f(data=data) + + return factory @abstractmethod - def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + """ + Construct :class:`MultiModalFieldElem` instances to represent + the provided data. + + This is the inverse of :meth:`reduce_data`. 
+ """ raise NotImplementedError - def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem: - return MultiModalFieldElem(self, data) + @abstractmethod + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + raise NotImplementedError - def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem: - """Merge multiple instances of :class:`MultiModalFieldElem` together.""" - fields = [item.field for item in batch] - if len(set(fields)) > 1: - raise ValueError(f"Cannot merge different {fields=}") + def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: + """ + Merge the data from multiple instances of :class:`MultiModalFieldElem`. - data = self._reduce_data([item.data for item in batch]) + This is the inverse of :meth:`build_elems`. + """ + field_types = [type(item.field) for item in elems] + if len(set(field_types)) > 1: + raise ValueError(f"Cannot merge different {field_types=}") - return self._build_elem(data) + return self._reduce_data([item.data for item in elems]) @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an element in the batch - is obtained by indexing into the first dimension of the underlying data. 
+ See also: + :func:`MultiModalFieldConfig.batched` """ - def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]: - return [self._build_elem(item) for item in batch] + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(item) for item in data] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): @@ -227,16 +289,20 @@ class MultiModalBatchedField(BaseMultiModalField): @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an element in the batch - is obtained by slicing along the first dimension of the underlying data. + See also: + :func:`MultiModalFieldConfig.flat` + :func:`MultiModalFieldConfig.flat_from_sizes` """ + slices: Sequence[slice] def build_elems( self, - batch: NestedTensors, - slices: Sequence[slice], - ) -> list[MultiModalFieldElem]: - return [self._build_elem(batch[slice_]) for slice_ in slices] + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data[s]) for s in self.slices] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): @@ -252,25 +318,121 @@ class MultiModalFlatField(BaseMultiModalField): return [e for elem in batch for e in elem] +@dataclass(frozen=True) +class MultiModalSharedField(BaseMultiModalField): + """ + See also: + :func:`MultiModalFieldConfig.shared` + """ + batch_size: int + + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data)] * 
self.batch_size + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + return batch[0] + + class MultiModalFieldConfig: @staticmethod def batched(modality: str): + """ + Defines a field where an element in the batch is obtained by + indexing into the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + + Example: + + .. code-block:: + + Input: + Data: [[AAAA] + [BBBB] + [CCCC]] + + Output: + Element 1: [AAAA] + Element 2: [BBBB] + Element 3: [CCCC] + """ return MultiModalFieldConfig( - field_cls=MultiModalBatchedField, + field=MultiModalBatchedField(), modality=modality, ) @staticmethod def flat(modality: str, slices: Sequence[slice]): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + slices: For each multi-modal item, a slice that is used to extract + the data corresponding to it. + + Example: + + .. code-block:: + + Given: + slices: [slice(0, 3), slice(3, 7), slice(7, 9)] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + """ return MultiModalFieldConfig( - field_cls=MultiModalFlatField, + field=MultiModalFlatField(slices=slices), modality=modality, - slices=slices, ) @staticmethod def flat_from_sizes(modality: str, size_per_item: torch.Tensor): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + size_per_item: For each multi-modal item, the size of the slice that + is used to extract the data corresponding to it. + + Example: + + ..
code-block:: + + Given: + size_per_item: [3, 4, 2] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + + See also: + :func:`MultiModalFieldConfig.flat` + """ + slice_idxs = [0, *accumulate(size_per_item)] slices = [ slice(slice_idxs[i], slice_idxs[i + 1]) @@ -279,25 +441,52 @@ class MultiModalFieldConfig: return MultiModalFieldConfig.flat(modality, slices) - def __init__( - self, - field_cls: type[BaseMultiModalField], - modality: str, - **field_config: Any, - ) -> None: + @staticmethod + def shared(modality: str, batch_size: int): + """ + Defines a field where an element in the batch is obtained by + taking the entirety of the underlying data. + + This means that the data is the same for each element in the batch. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + batch_size: The number of multi-modal items which share this data. + + Example: + + .. code-block:: + + Given: + batch_size: 4 + + Input: + Data: [XYZ] + + Output: + Element 1: [XYZ] + Element 2: [XYZ] + Element 3: [XYZ] + Element 4: [XYZ] + """ + return MultiModalFieldConfig( + field=MultiModalSharedField(batch_size), + modality=modality, + ) + + def __init__(self, field: BaseMultiModalField, modality: str) -> None: super().__init__() - self.field_cls = field_cls + self.field = field self.modality = modality - self.field_config = field_config def build_elems( self, key: str, batch: NestedTensors, ) -> Sequence[MultiModalFieldElem]: - field = self.field_cls(key=key, modality=self.modality) - return field.build_elems(batch, **self.field_config) # type: ignore + return self.field.build_elems(self.modality, key, batch) class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): @@ -308,11 +497,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): @staticmethod def from_elems(elems: Sequence[MultiModalFieldElem]): - return MultiModalKwargsItem({elem.field.key: elem for elem in elems}) + return 
MultiModalKwargsItem({elem.key: elem for elem in elems}) @property def modality(self) -> str: - modalities = {elem.field.modality for elem in self.data.values()} + modalities = {elem.modality for elem in self.data.values()} assert len(modalities) == 1, f"Found different modalities={modalities}" return next(iter(modalities)) @@ -372,7 +561,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): elems_by_key[key].append(elem) data = { - key: elems[0].field.reduce(elems).data + key: elems[0].field.reduce_data(elems) for key, elems in elems_by_key.items() if len(elems) > 0 } -- GitLab From 9a5b1554b4f049aad6398bb29d3064138ac9a039 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 5 Feb 2025 16:30:50 -0500 Subject: [PATCH 55/65] [Docs] Drop duplicate [source] links --- docs/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index ea3b56e02..f4e8c8b94 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -37,7 +37,6 @@ author = 'the vLLM Team' # ones. 
extensions = [ "sphinx.ext.napoleon", - "sphinx.ext.viewcode", "sphinx.ext.linkcode", "sphinx.ext.intersphinx", "sphinx_copybutton", -- GitLab From bf3b79efb82676219a3275764d8fcf4c70097ce5 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 5 Feb 2025 13:31:38 -0800 Subject: [PATCH 56/65] [VLM] Qwen2.5-VL --- docs/source/models/supported_models.md | 11 + examples/offline_inference/vision_language.py | 31 + .../vision_language_multi_image.py | 58 + .../vision_language/test_models.py | 22 + .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/entrypoints/chat_utils.py | 4 +- .../model_executor/layers/rotary_embedding.py | 58 +- vllm/model_executor/models/qwen2_5_vl.py | 1133 +++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 16 +- vllm/model_executor/models/registry.py | 1 + vllm/v1/worker/gpu_model_runner.py | 12 +- vllm/worker/cpu_model_runner.py | 9 +- vllm/worker/model_runner.py | 9 +- 14 files changed, 1315 insertions(+), 52 deletions(-) create mode 100644 vllm/model_executor/models/qwen2_5_vl.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index d8e284292..3e8b2f896 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -846,6 +846,13 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `Qwen2_5_VLForConditionalGeneration` + * Qwen2.5-VL + * T + IE+ + VE+ + * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. + * + * ✅︎ + * ✅︎ - * `UltravoxModel` * Ultravox * T + AE+ @@ -880,6 +887,10 @@ The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingf A corrected version is available at . ::: +:::{note} +To use Qwen2.5-VL series models, you have to install Huggingface `transformers` library from source via `pip install git+https://github.com/huggingface/transformers`. 
+::: + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 65940b6ad..436c36570 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -531,6 +531,36 @@ def run_qwen2_vl(question: str, modality: str): return llm, prompt, stop_token_ids +# Qwen2.5-VL +def run_qwen2_5_vl(question: str, modality: str): + + model_name = "Qwen/Qwen2.5-VL-3B-Instruct" + + llm = LLM( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "aria": run_aria, "blip-2": run_blip2, @@ -557,6 +587,7 @@ model_example_map = { "pixtral_hf": run_pixtral_hf, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, + "qwen2_5_vl": run_qwen2_5_vl, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 601ac96e1..8d2172a60 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -392,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) +def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: + try: + from qwen_vl_utils import process_vision_info + except ModuleNotFoundError: + print('WARNING: `qwen-vl-utils` 
not installed, input images will not ' + 'be automatically resized. You can enable this functionality by ' + '`pip install qwen-vl-utils`.') + process_vision_info = None + + model_name = "Qwen/Qwen2.5-VL-3B-Instruct" + + llm = LLM( + model=model_name, + max_model_len=32768 if process_vision_info is None else 4096, + max_num_seqs=5, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + stop_token_ids = None + + if process_vision_info is None: + image_data = [fetch_image(url) for url in image_urls] + else: + image_data, _ = process_vision_info(messages, + return_video_sample_fps=False) + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=image_data, + chat_template=None, + ) + + model_example_map = { "aria": load_aria, "deepseek_vl_v2": load_deepseek_vl2, @@ -404,6 +461,7 @@ model_example_map = { "pixtral_hf": load_pixtral_hf, "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, + "qwen2_5_vl": load_qwen2_5_vl, } diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 85bc4ac13..95505dcf5 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -121,6 +121,8 @@ VLM_TEST_SETTINGS = { else ("half", "float")), marks=[pytest.mark.core_model], ), + # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL + # once we upgraded to transformers>=4.49.0. 
"qwen2_vl": VLMTestInfo( models=["Qwen/Qwen2-VL-2B-Instruct"], test_type=( @@ -138,6 +140,26 @@ VLM_TEST_SETTINGS = { image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + "qwen2_5_vl": VLMTestInfo( + models=["Qwen/Qwen2.5-VL-3B-Instruct"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.skipif( + TRANSFORMERS_VERSION < "4.49.0", + reason="HF model requires transformers>=4.49.0", + ), pytest.mark.core_model, pytest.mark.cpu_model], + ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 5cd749cbd..77cf3442d 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -161,6 +161,7 @@ def _test_processing_correctness( "nvidia/NVLM-D-72B", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "fixie-ai/ultravox-v0_3", ]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 285fbe484..20787fe00 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -264,6 +264,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 + min_transformers_version="4.49"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", trust_remote_code=True), # [Encoder-decoder] diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3a6e75b1d..f04902ae1 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -410,7 +410,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return "" if model_type == "mllama": return "<|image|>" - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" @@ -430,7 +430,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return "()" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|video_pad|><|vision_end|>" if model_type in ("minicpmo", "minicpmv"): return "()" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 814c3b7d9..b3b9b0e87 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,6 +27,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.model_executor.custom_op import CustomOp @@ -772,8 +773,12 @@ class MRotaryEmbedding(RotaryEmbedding): dtype: torch.dtype, mrope_section: Optional[List[int]] = None, ) -> None: - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) + # In Qwen2.5-VL, the maximum index value is related to the duration of + # the input video. 
We enlarge max_position_embeddings by a factor of 4 to get + # a larger cos and sin cache. + self.cache_max_position_num = max_position_embeddings * 4 + super().__init__(head_size, rotary_dim, self.cache_max_position_num, + base, is_neox_style, dtype) self.mrope_section = mrope_section if self.mrope_section: @@ -831,13 +836,10 @@ class MRotaryEmbedding(RotaryEmbedding): @staticmethod def get_input_positions( input_tokens: List[int], + hf_config: PretrainedConfig, image_grid_thw: Union[List[List[int]], torch.Tensor], video_grid_thw: Union[List[List[int]], torch.Tensor], - image_token_id: int, - video_token_id: int, - vision_start_token_id: int, - vision_end_token_id: int, - spatial_merge_size: int, + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, ) -> Tuple[List[List[int]], int]: @@ -845,16 +847,13 @@ class MRotaryEmbedding(RotaryEmbedding): llm_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions_tensor( - input_tokens, - image_grid_thw, - video_grid_thw, - image_token_id, - video_token_id, - vision_start_token_id, - vision_end_token_id, - spatial_merge_size, - context_len, - seq_len, + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + seq_len=seq_len, ) return llm_positions.tolist(), mrope_position_delta @@ -862,18 +861,22 @@ class MRotaryEmbedding(RotaryEmbedding): @staticmethod def get_input_positions_tensor( input_tokens: List[int], + hf_config: PretrainedConfig, image_grid_thw: Union[List[List[int]], torch.Tensor], video_grid_thw: Union[List[List[int]], torch.Tensor], - image_token_id: int, - video_token_id: int, - vision_start_token_id: int, - vision_end_token_id: int, - spatial_merge_size: int, + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, ) -> Tuple[torch.Tensor, int]: """Get mrope input
positions and delta value.""" + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, + "tokens_per_second", 1.0) + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -892,6 +895,7 @@ class MRotaryEmbedding(RotaryEmbedding): image_index, video_index = 0, 0 for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 if image_token_id in input_tokens and remain_images > 0: ed_image = input_tokens.index(image_token_id, st) else: @@ -915,9 +919,13 @@ class MRotaryEmbedding(RotaryEmbedding): video_grid_thw[video_index][1], video_grid_thw[video_index][2], ) + video_second_per_grid_t = 1.0 + if second_per_grid_ts is not None: + video_second_per_grid_t = second_per_grid_ts[video_index] video_index += 1 remain_videos -= 1 ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = \ t, h // spatial_merge_size, w // spatial_merge_size text_len = ed - st @@ -927,8 +935,10 @@ class MRotaryEmbedding(RotaryEmbedding): llm_pos_ids_list.append( torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - t_index = torch.arange(llm_grid_t).view(-1, 1).expand( - -1, llm_grid_h * llm_grid_w).flatten() + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * + tokens_per_second).long().flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( llm_grid_t, -1, llm_grid_w).flatten() w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py new file mode 100644 index 000000000..e93cf46b9 --- /dev/null +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -0,0 +1,1133 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from functools import cached_property, partial +from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import BatchFeature +from transformers.models.qwen2_5_vl import (Qwen2_5_VLImageProcessor, + Qwen2_5_VLProcessor) +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalFieldConfig +from vllm.platforms import _Backend +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import uses_mrope + +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder +from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, + apply_rotary_pos_emb_vision) +from .utils import 
(AutoWeightsLoader, WeightsMapper, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + +# === Vision Inputs === # + + +class Qwen2_5_VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +class Qwen2_5_VLImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs, + Qwen2_5_VLImageEmbeddingInputs] + + +class Qwen2_5_VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_videos: torch.Tensor + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + second_per_grid_ts: torch.Tensor + """ + The video time interval (in seconds) for each grid along the temporal + dimension in the 3D position IDs. Returned when `videos` is not `None`. 
+ """ + + +class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + video_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all videos' features. + Each tensor holds an video's features. + - `torch.Tensor`: A tensor holding all videos' features + (concatenation of all videos' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the videos. + - `hidden_size` must match the hidden size of language model backbone. + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs, + Qwen2_5_VLVideoEmbeddingInputs] + +# === Vision Encoder === # + + +class Qwen2_5_VisionMLP(nn.Module): + + def __init__(self, + in_features: int, + hidden_features: int, + bias: bool = False, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.gate_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_proj") + self.up_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_fn = act_fn + + def forward(self, x: torch.Tensor): + x_gate, _ = self.gate_proj(x) + x_gate = self.act_fn(x_gate) + x_up, _ = self.up_proj(x) + x_down, _ = self.down_proj(x_gate * x_up) + return x_down + + +class Qwen2_5_VisionAttention(nn.Module): + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + 
super().__init__() + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, world_size) + + self.qkv = ColumnParallelLinear(input_size=embed_dim, + output_size=3 * projection_size, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj") + + # Detect attention implementation. + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS + }: + raise RuntimeError( + f"Qwen2.5-VL does not support {self.attn_backend} backend now." + ) + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + x = x.view(*new_x_shape) + + # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] + q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + + if self.attn_backend == _Backend.FLASH_ATTN: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) + from flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0, + causal=False) + + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + seq_length = q.size(1) + q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) + attention_mask = torch.zeros([1, seq_length, seq_length], + device=q.device, + dtype=torch.bool) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], + cu_seqlens[i - 1]:cu_seqlens[i]] = True + output = F.scaled_dot_product_attention(q, + k, + v, + attention_mask, + dropout_p=0.0) + context_layer = rearrange(output, "b h s d -> b s h d ") + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Qwen2RMSNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, 
eps={self.variance_epsilon}" + + +class Qwen2_5_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + self.mlp = Qwen2_5_VisionMLP(dim, + mlp_hidden_dim, + act_fn=act_fn, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb) + x = x + self.mlp(self.norm2(x)) + return x + + +class Qwen2_5_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + hidden_size: int = 1152, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.hidden_size = hidden_size + + kernel_size = (temporal_patch_size, patch_size, patch_size) + self.proj = nn.Conv3d(in_channels, + hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, + self.patch_size) + x = self.proj(x).view(L, self.hidden_size) + return x + + +class Qwen2_5_VisionPatchMerger(nn.Module): + + def __init__( + self, + d_model: int, + context_dim: int, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + spatial_merge_size: int = 2, + quant_config: 
Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.ln_q = norm_layer(context_dim) + self.mlp = nn.ModuleList([ + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.0"), + nn.GELU(), + RowParallelLinear(self.hidden_size, + d_model, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.2"), + ]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x) + x = x.view(-1, self.hidden_size) + + mlp_fc1, mlp_act, mlp_fc2 = self.mlp + x_parallel, _ = mlp_fc1(x) + x_parallel = mlp_act(x_parallel) + out, _ = mlp_fc2(x_parallel) + return out + + +class Qwen2_5_VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._freqs_cached = None + + def update_freqs_cache(self, seqlen: int) -> None: + if seqlen > self._seq_len_cached: + seqlen *= 2 + self._seq_len_cached = seqlen + self.inv_freq = 1.0 / (self.theta**(torch.arange( + 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) + / self.dim)) + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + self._freqs_cached = freqs + + def forward(self, seqlen: int) -> torch.Tensor: + self.update_freqs_cache(seqlen) + return self._freqs_cached[:seqlen] + + +class Qwen2_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + 
+ patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + in_channels = vision_config.in_channels + depth = vision_config.depth + self.hidden_size = vision_config.hidden_size + self.num_heads = vision_config.num_heads + + # args for get_window_index + self.window_size = vision_config.window_size + self.patch_size = vision_config.patch_size + self.spatial_merge_size = vision_config.spatial_merge_size + self.fullatt_block_indexes = vision_config.fullatt_block_indexes + self.spatial_merge_unit = self.spatial_merge_size**2 + + self.patch_embed = Qwen2_5_VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=in_channels, + hidden_size=self.hidden_size, + ) + + # NOTE: We use torch native RMSNorm here for precision purposes. + norm_layer = partial(Qwen2RMSNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) + ]) + self.merger = Qwen2_5_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + ) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids 
= hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = (self.window_size // + self.spatial_merge_size // self.patch_size) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = seqlens.cumsum( + 0) * 
self.spatial_merge_unit + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # patchify + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # windows attention + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + rotary_pos_emb = rotary_pos_emb[window_index, :, :] + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + # compute cu_seqlens + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) + + # transformers + hidden_states = hidden_states.unsqueeze(1) + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb=rotary_pos_emb) + + # adapter + hidden_states = self.merger(hidden_states) + reverse_indices = torch.argsort(window_index) + hidden_states = 
hidden_states[reverse_indices, :] + return hidden_states + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith("qkv.weight"): + visual_num_heads = self.num_heads + visual_embed_dim = self.hidden_size + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size, + visual_embed_dim) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) + elif name.endswith("qkv.bias"): + visual_num_heads = self.num_heads + visual_embed_dim = self.hidden_size + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1) + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen2_5_VLConfig) + + def get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + fps: Optional[float] = 2.0, + ) -> Qwen2_5_VLProcessor: + hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor) + 
image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2_5_VLImageProcessor) + + if min_pixels: + image_processor.min_pixels = min_pixels + if max_pixels: + image_processor.max_pixels = max_pixels + if max_pixels or min_pixels: + image_processor.size = { + "min_pixels": image_processor.min_pixels, + "max_pixels": image_processor.max_pixels, + } + + return hf_processor + + def get_image_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + fps: Optional[float] = 2.0, + ) -> Qwen2_5_VLImageProcessor: + hf_processor = self.get_hf_processor( + min_pixels=min_pixels, + max_pixels=max_pixels, + fps=fps, + ) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2_5_VLImageProcessor) + return image_processor + + +class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs), + second_per_grid_ts=MultiModalFieldConfig.batched("video"), + ) + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen2_5_VLMultiModalProcessor, + info=Qwen2_5_VLProcessingInfo, + dummy_inputs=Qwen2_5_VLDummyInputsBuilder) +class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ] + } + + # LoRA specific attributes, TODO: double check + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "gate_proj", + "up_proj", + # vision tower + "qkv", + "attn.proj", # Distinguish patch_embed.proj + "fc1", + "fc2", + # projector + "mlp.0", + "mlp.2" + ] + embedding_modules = {} + embedding_padding_modules = [] + + # To ensure correct weight loading and mapping.
+ hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.visual = Qwen2_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Qwen2ForCausalLM"], + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler + + return get_sampler() + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. 
" + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Qwen2_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + return Qwen2_5_VLImageEmbeddingInputs( + type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + second_per_grid_ts = kwargs.pop("second_per_grid_ts", None) + + if pixel_values_videos is None and video_embeds is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2_5_VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") + return Qwen2_5_VLVideoEmbeddingInputs( + type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) + + def _process_image_input( + self, + image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. 
+ merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, + video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + return modalities + + def get_multimodal_embeddings( + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor corresponding to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities.
+ for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[tuple[torch.Tensor, ...]] = None, + video_input: Optional[tuple[torch.Tensor, ...]] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + """Run forward pass for Qwen2.5-VL. 
+ + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,)`. + pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. + `None` if no images are passed. + pixel_values_videos: Pixel values of videos to be fed to a model. + `None` if no videos are passed. + video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. + `None` if no videos are passed. + second_per_grid_ts: Tensor `(num_videos)` of video time interval ( + in seconds) for each grid along the temporal dimension in the + 3D position IDs. `None` if no videos are passed. + """ + + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility.
+ elif inputs_embeds is None: + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="visual.", + tower_model="visual.merger.") diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 2b2638cf6..34ae7b8c9 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -650,8 +650,8 @@ class Qwen2VisionTransformer(nn.Module): return loaded_params -class Qwen2EmbeddingItems(ModalityDataItems[dict[str, 
torch.Tensor], - dict[str, torch.Tensor]]): +class Qwen2VLEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): def __init__(self, data: dict, modality: str) -> None: super().__init__(data, modality) @@ -683,26 +683,26 @@ class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], return self.data -class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems): +class Qwen2VLImageEmbeddingItems(Qwen2VLEmbeddingItems): def __init__(self, data: dict) -> None: super().__init__(data, "image") -class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems): +class Qwen2VLVideoEmbeddingItems(Qwen2VLEmbeddingItems): def __init__(self, data: dict) -> None: super().__init__(data, "video") -class Qwen2MultiModalDataParser(MultiModalDataParser): +class Qwen2VLMultiModalDataParser(MultiModalDataParser): def _parse_image_data( self, data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], ) -> ModalityDataItems[Any, Any]: if isinstance(data, dict): - return Qwen2EmbeddingItems(data, modality="image") + return Qwen2VLEmbeddingItems(data, modality="image") return super()._parse_image_data(data) @@ -711,7 +711,7 @@ class Qwen2MultiModalDataParser(MultiModalDataParser): data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], ) -> ModalityDataItems[Any, Any]: if isinstance(data, dict): - return Qwen2EmbeddingItems(data, modality="video") + return Qwen2VLEmbeddingItems(data, modality="video") return super()._parse_video_data(data) @@ -948,7 +948,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] ): def _get_data_parser(self) -> MultiModalDataParser: - return Qwen2MultiModalDataParser() + return Qwen2VLMultiModalDataParser() def _get_prompt_replacements( self, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 962f95f10..b6708f77d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -172,6 +172,7 @@ _MULTIMODAL_MODELS = { 
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), # [Encoder-decoder] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7841fac1d..ec6d04cd4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -285,6 +285,7 @@ class GPUModelRunner: if self.model_config.uses_mrope: image_grid_thw = [] video_grid_thw = [] + second_per_grid_ts = [] for mm_input in self.requests[req_id].mm_inputs: if mm_input.get("image_grid_thw") is not None: image_grid_thw.extend( @@ -292,6 +293,9 @@ class GPUModelRunner: if mm_input.get("video_grid_thw") is not None: video_grid_thw.extend( mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) hf_config = self.model_config.hf_config @@ -299,14 +303,10 @@ class GPUModelRunner: self.requests[req_id].mrope_position_delta = \ MRotaryEmbedding.get_input_positions_tensor( self.requests[req_id].prompt_token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. 
- spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, ) req_ids_to_add.append(req_id) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 1c3feece9..940089310 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -386,20 +386,17 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) hf_config = self.runner.model_config.hf_config token_ids = seq_data.get_token_ids() mrope_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. 
- spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=computed_len, ) seq_data.mrope_position_delta = mrope_position_delta diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0bbba55b3..12baecde6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -702,6 +702,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) hf_config = self.runner.model_config.hf_config inter_data.mrope_input_positions = [None] * inter_data.n_seqs @@ -713,14 +714,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): mrope_input_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. 
- spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=inter_data.context_lens[seq_idx], seq_len=inter_data.seq_lens[seq_idx], ) -- GitLab From 75404d041be0d6e656b59cbbea23520d47d37b66 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 6 Feb 2025 11:09:45 +0800 Subject: [PATCH 57/65] [VLM] Update compatibility with transformers 4.49 --- docs/source/models/supported_models.md | 3 +- examples/template_pixtral_hf.jinja | 38 ------------------- tests/entrypoints/test_chat_utils.py | 1 - .../vision_language/test_models.py | 4 +- .../vision_language/test_llava_next.py | 7 ++-- vllm/model_executor/models/llava.py | 33 +++++++++++----- vllm/model_executor/models/llava_next.py | 10 ++++- vllm/model_executor/models/minicpmv.py | 9 +++++ vllm/multimodal/inputs.py | 4 +- 9 files changed, 50 insertions(+), 59 deletions(-) delete mode 100644 examples/template_pixtral_hf.jinja diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3e8b2f896..ef7e77fa3 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -883,8 +883,7 @@ For more details, please see: ::: :::{note} -The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). -A corrected version is available at . +`mistral-community/pixtral-12b` does not support V1 yet. 
::: :::{note} diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja deleted file mode 100644 index e94661cb3..000000000 --- a/examples/template_pixtral_hf.jinja +++ /dev/null @@ -1,38 +0,0 @@ -{%- if messages[0]["role"] == "system" %} - {%- set system_message = messages[0]["content"] %} - {%- set loop_messages = messages[1:] %} -{%- else %} - {%- set loop_messages = messages %} -{%- endif %} - -{{- bos_token }} -{%- for message in loop_messages %} - {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} - {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} - {%- endif %} - {%- if message["role"] == "user" %} - {%- if loop.last and system_message is defined %} - {{- "[INST]" + system_message + "\n" }} - {%- else %} - {{- "[INST]" }} - {%- endif %} - {%- if message["content"] is not string %} - {%- for chunk in message["content"] %} - {%- if chunk["type"] == "text" %} - {{- chunk["text"] }} - {%- elif chunk["type"] == "image" %} - {{- "[IMG]" }} - {%- else %} - {{- raise_exception("Unrecognized content type!") }} - {%- endif %} - {%- endfor %} - {%- else %} - {{- message["content"] }} - {%- endif %} - {{- "[/INST]" }} - {%- elif message["role"] == "assistant" %} - {{- message["content"] + eos_token}} - {%- else %} - {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} - {%- endif %} -{%- endfor %} diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 737f73309..5c469007a 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -761,7 +761,6 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), - ("template_pixtral_hf.jinja", "openai"), ("template_vlm2vec.jinja", "openai"), 
("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 95505dcf5..b00ec6fa6 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -224,7 +224,7 @@ VLM_TEST_SETTINGS = { marks=[ pytest.mark.skipif( Version(TRANSFORMERS_VERSION) >= Version("4.48"), - reason="HF model is not compatible with transformers>=4.48.0", + reason="HF model is not compatible with transformers>=4.48", ) ], ), @@ -359,7 +359,7 @@ VLM_TEST_SETTINGS = { marks=[ pytest.mark.skipif( Version(TRANSFORMERS_VERSION) >= Version("4.48"), - reason="HF model is not compatible with transformers>=4.48.0", + reason="HF model is not compatible with transformers>=4.48", ) ], ), diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 6ba3c5403..990c6c150 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -4,7 +4,6 @@ from typing import List, Type import pytest import torch.nn.functional as F -import transformers from transformers import AutoModelForVision2Seq from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner @@ -57,6 +56,10 @@ def _run_test( with hf_runner(model, dtype=dtype, auto_cls=AutoModelForVision2Seq) as hf_model: + # Patch the issue where generation_config.json is missing + hf_model.processor.patch_size = \ + hf_model.model.config.vision_config.patch_size + # Patch the issue where image_token_id # exceeds the maximum allowed vocab size hf_model.model.resize_token_embeddings( @@ -88,8 +91,6 @@ def _run_test( ) -@pytest.mark.skipif(transformers.__version__ >= "4.46", - reason="Model broken with changes in transformers 4.46") @pytest.mark.core_model 
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 63d308ef6..b1fee3eeb 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -293,16 +293,29 @@ class PixtralHFMultiModalProcessor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: - images = mm_data["images"] - assert isinstance(images, list) - - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) and len(pixel_values) == 1) - assert (isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) - - processed_outputs["pixel_values"] = pixel_values[0] + # Before/after https://github.com/huggingface/transformers/pull/35122 + if Version(TRANSFORMERS_VERSION) <= Version("4.48.2"): + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) + + processed_outputs["pixel_values"] = pixel_values[0] + else: + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly + image_sizes = processed_outputs["image_sizes"] + assert len(pixel_values) == len(image_sizes) + + processed_outputs["pixel_values"] = [ + p[:, :h, :w] + for p, (h, w) in zip(pixel_values, image_sizes) + ] return processed_outputs diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index defdeb54a..719916642 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -73,7 +73,15 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): return self.ctx.get_hf_config(LlavaNextConfig) 
def get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaNextProcessor) + hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor) + + # In case patch_size is omitted from `processor_config.json` + # e.g. for E5-V: https://huggingface.co/royokong/e5-v + if hf_processor.patch_size is None: + patch_size = self.get_vision_encoder_info().get_patch_size() + hf_processor.patch_size = patch_size + + return hf_processor # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113 def get_num_image_tokens( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 20f3a3d19..58a4448d4 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -342,6 +342,15 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): **kwargs: object, ): hf_processor = self.ctx.get_hf_processor() + + # NumPy arrays are considered as Iterable but not Sequence in + # https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428 + image_processor = hf_processor.image_processor # type: ignore + for attr in ("mean", "std"): + val = getattr(image_processor, attr) + if isinstance(val, np.ndarray): + setattr(image_processor, attr, val.tolist()) + return hf_processor def get_image_processor(self): diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2f2535f36..5f9593ee8 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -141,9 +141,9 @@ Uses a list instead of a tensor if the dimensions of each element do not match. 
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: """Equality check between :data:`NestedTensors` objects.""" if isinstance(a, torch.Tensor): - return isinstance(b, torch.Tensor) and bool((a == b).all().item()) + return isinstance(b, torch.Tensor) and torch.equal(a, b) elif isinstance(b, torch.Tensor): - return isinstance(a, torch.Tensor) and bool((b == a).all().item()) + return isinstance(a, torch.Tensor) and torch.equal(b, a) if isinstance(a, list): return (isinstance(b, list) -- GitLab From 5b19b93082fc5ad0ce33752d8467337cbe93de21 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 5 Feb 2025 22:15:08 -0500 Subject: [PATCH 58/65] [ROCm][Kernel] Using the correct warp_size value --- csrc/moe/moe_align_sum_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index ff74a42d7..01dac4044 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -207,8 +207,8 @@ __global__ void sgl_moe_align_block_size_kernel( __shared__ int32_t shared_counts[32][8]; __shared__ int32_t local_offsets[256]; - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; const int experts_per_warp = 8; const int my_expert_start = warp_id * experts_per_warp; -- GitLab From 76abd0c88143419826bfc13d2cd29669d0fdfa1b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 5 Feb 2025 22:22:19 -0500 Subject: [PATCH 59/65] [Bugfix] Better FP8 supported defaults --- .../layers/quantization/utils/fp8_utils.py | 28 +++++++++++-------- .../layers/quantization/utils/w8a8_utils.py | 6 +++- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 
10ff71e57..99fbda314 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( _normalize_quant_group_shape, scaled_dequantize) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear) + CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -38,7 +38,7 @@ def apply_w8a8_block_fp8_linear( weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, - cutlass_block_fp8_supported: bool = True, + cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, ) -> torch.Tensor: assert input_scale is None # View input as 2D matrix for fp8 methods @@ -85,12 +85,14 @@ def apply_w8a8_block_fp8_linear( # `apply_fp8_linear` # NOTE(lucas): this is quite messy, we should think through this more formally def apply_fp8_linear_generic( - input: torch.Tensor, - weight: torch.Tensor, - weight_scale: torch.Tensor, - input_group_shape: Tuple[int, int], - weight_group_shape: Tuple[int, int], - input_scale: Optional[torch.Tensor] = None, # static scale if one + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_group_shape: Tuple[int, int], + weight_group_shape: Tuple[int, int], + input_scale: Optional[torch.Tensor] = None, # static scale if one + cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, + cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, ) -> torch.Tensor: # View input as 2D matrix for fp8 methods input = input.view(-1, input.shape[-1]) @@ -105,14 +107,18 @@ def apply_fp8_linear_generic( if is_dim_blocked(0, weight.shape, weight_group_shape[0])\ and is_dim_blocked(1, weight.shape, weight_group_shape[1]) and\ input_group_shape == (1, 
weight_group_shape[1]): - return apply_w8a8_block_fp8_linear(input, weight, - list(weight_group_shape), - weight_scale) + return apply_w8a8_block_fp8_linear( + input, + weight, + list(weight_group_shape), + weight_scale, + cutlass_block_fp8_supported=cutlass_block_fp8_supported) else: # Despite having linear in the it doesn't conform to # `torch.nn.functional.linear` which is defined as `input @ weight.T` # so we explicitly transpose the weight matrix here return apply_fp8_linear(input, weight.T, weight_scale.T, + cutlass_fp8_supported=cutlass_fp8_supported, use_per_token_if_dynamic=\ (input_group_shape == (1, input.shape[1]))) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 3fd88e875..dedeb0c29 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -42,6 +42,10 @@ def cutlass_block_fp8_supported() -> bool: return ops.cutlass_scaled_mm_supports_block_fp8(capability) +CUTLASS_FP8_SUPPORTED = cutlass_fp8_supported() +CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported() + + def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: @@ -109,7 +113,7 @@ def apply_fp8_linear( input_scale: Optional[torch.Tensor] = None, input_scale_ub: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, - cutlass_fp8_supported: bool = True, + cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, use_per_token_if_dynamic: bool = False, ) -> torch.Tensor: # ops.scaled_fp8_quant supports both dynamic and static quant. 
-- GitLab From 9cdea30b4fe0ebd23847371f51ea0a48b9615847 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Wed, 5 Feb 2025 19:23:35 -0800 Subject: [PATCH 60/65] [Misc][Easy] Remove the space from the file name --- ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 vllm/model_executor/layers/fused_moe/fused_moe.py | 2 +- ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 
...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 vllm/model_executor/layers/quantization/utils/fp8_utils.py | 2 +- 42 files changed, 2 insertions(+), 2 deletions(-) rename vllm/model_executor/layers/fused_moe/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/fused_moe/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename 
vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename 
vllm/model_executor/layers/quantization/utils/configs/{N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename 
vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename 
vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename 
vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1bed35525..f14200e02 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -765,7 +765,7 @@ def get_config_file_name(E: int, device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" block_shape_selector = ("" if not block_shape or not all(block_shape) else - f",block_shape={block_shape}") + f",block_shape={block_shape}").replace(" ", "") return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json 
similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from 
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from 
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json 
rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 
128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 99fbda314..9895537c2 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -423,7 +423,7 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, # First look up if an optimized configuration is available in the configs # directory device_name = current_platform.get_device_name().replace(" ", "_") - json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n},{block_k}].json" # noqa: E501 config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) -- GitLab From d88506dda45f69cf1f56c4325282c2a881eaaaf7 
Mon Sep 17 00:00:00 2001 From: Sumit Vij Date: Wed, 5 Feb 2025 19:54:13 -0800 Subject: [PATCH 61/65] [Model] LoRA Support for Ultravox model (#11253) --- docs/source/models/supported_models.md | 2 +- tests/conftest.py | 16 +++- tests/lora/test_ultravox.py | 121 +++++++++++++++++++++++++ vllm/model_executor/models/ultravox.py | 28 +++++- 4 files changed, 160 insertions(+), 7 deletions(-) create mode 100644 tests/lora/test_ultravox.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index ef7e77fa3..32f3e9def 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -857,7 +857,7 @@ See [this page](#generative-models) for more information on how to use generativ * Ultravox * T + AE+ * `fixie-ai/ultravox-v0_3` - * + * ✅︎ * ✅︎ * ✅︎ ::: diff --git a/tests/conftest.py b/tests/conftest.py index 85dd5bcb0..02105900f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -737,6 +737,7 @@ class VllmRunner: images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: inputs = self.get_inputs(prompts, images=images, @@ -744,7 +745,8 @@ class VllmRunner: audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) outputs: List[Tuple[List[List[int]], List[str]]] = [] for req_output in req_outputs: @@ -782,6 +784,7 @@ class VllmRunner: images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: inputs = self.get_inputs(prompts, @@ -790,7 +793,8 @@ class VllmRunner: audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) 
toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -826,13 +830,15 @@ class VllmRunner: images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images, videos=videos, - audios=audios) + audios=audios, + **kwargs) return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] @@ -847,6 +853,7 @@ class VllmRunner: videos: Optional[PromptVideoInput] = None, stop_token_ids: Optional[List[int]] = None, stop: Optional[List[str]] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -861,7 +868,8 @@ class VllmRunner: greedy_logprobs_params, images=images, audios=audios, - videos=videos) + videos=videos, + **kwargs) def generate_encoder_decoder_greedy_logprobs( self, diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py new file mode 100644 index 000000000..1218dfa34 --- /dev/null +++ b/tests/lora/test_ultravox.py @@ -0,0 +1,121 @@ +import shutil +from os import path +from tempfile import TemporaryDirectory +from typing import List, Tuple + +import torch +from huggingface_hub import snapshot_download +from safetensors.torch import load_file, save_file +from transformers import AutoTokenizer + +from vllm.lora.request import LoRARequest + +from ..models.utils import check_outputs_equal + +ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3" +LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" + +VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" + +PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!" 
+ + +def llama3_1_8b_chess_lora_path(): + return snapshot_download( + repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b") + + +# can't use llama lora adapter without module name transformation +# because ultravox nest language model +def transform_module_names_for_ultravox(state_dict): + transformed_state_dict = {} + for key, value in state_dict.items(): + new_key = key.replace("base_model.model", + "base_model.model.language_model") + transformed_state_dict[new_key] = value + return transformed_state_dict + + +def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path): + tensor_file = "adapter_model.safetensors" + state_dict = load_file(path.join(source_repo, tensor_file)) + transformed_state_dict = transform_module_names_for_ultravox(state_dict) + + save_file(transformed_state_dict, path.join(target_path, tensor_file)) + + config_file = "adapter_config.json" + shutil.copyfile(path.join(source_repo, config_file), + path.join(target_path, config_file)) + return target_path + + +def _get_prompt(audio_count, question, placeholder, model_name) -> str: + tokenizer = AutoTokenizer.from_pretrained(model_name) + placeholder = f"{placeholder}\n" * audio_count + + return tokenizer.apply_chat_template([{ + 'role': 'user', + 'content': f"{placeholder}{question}" + }], + tokenize=False, + add_generation_prompt=True) + + +def test_ultravox_lora(vllm_runner): + """ + TODO: Train an Ultravox LoRA instead of using a Llama LoRA. + """ + # Workaround to prevent device mismatch in Whisper. 
+ # Can be removed when it is fixed upstream in transformer + # https://github.com/huggingface/transformers/pull/35866 + torch.set_default_device("cpu") + + llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path() + with TemporaryDirectory() as temp_ultravox_lora_dir: + llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora( + llama3_1_8b_chess_lora, temp_ultravox_lora_dir) + with vllm_runner( + ULTRAVOX_MODEL_NAME, + enforce_eager=True, + max_num_seqs=2, + enable_lora=True, + max_loras=1, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=1024, + ) as vllm_model: + ultravox_outputs: List[Tuple[ + List[int], str]] = vllm_model.generate_greedy( + [ + _get_prompt(0, PROMPT, VLLM_PLACEHOLDER, + ULTRAVOX_MODEL_NAME) + ], + 256, + lora_request=LoRARequest(str(1), 1, + llama3_1_8b_ultravox_chess_lora), + ) + + # run llama with and without lora to compare outputs with above + with vllm_runner( + LLMA_MODEL_NAME, + enforce_eager=True, + max_num_seqs=2, + enable_lora=True, + max_loras=1, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=1024, + ) as vllm_model: + llama_outputs: List[Tuple[List[int], str]] = ( + vllm_model.generate_greedy( + [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)], + 256, + lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora), + )) + + check_outputs_equal( + outputs_0_lst=ultravox_outputs, + outputs_1_lst=llama_outputs, + name_0="ultravox", + name_1="llama", + ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 52a4d798f..9da0682cf 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -22,6 +22,7 @@ from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader +from 
vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, @@ -33,7 +34,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings, @@ -343,7 +344,20 @@ class ModifiedWhisperEncoder(WhisperEncoder): UltravoxMultiModalProcessor, info=UltravoxProcessingInfo, dummy_inputs=UltravoxDummyInputsBuilder) -class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): +class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + # LoRA specific attributes + # TODO : Add LoRA to the audio tower and projector. 
+ supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj" + ] + embedding_modules = {} + embedding_padding_modules = [] hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}) @@ -391,6 +405,16 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): return get_sampler() + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model.", + connector="multi_modal_projector.", + tower_model="audio_tower.", + ) + def _audio_features_to_embeddings( self, input_features: torch.Tensor) -> torch.Tensor: audio_input = input_features.to(self.audio_tower.dtype) -- GitLab From 56534cd577211c563b2c5b74098929b949fc4063 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Wed, 5 Feb 2025 21:25:54 -0800 Subject: [PATCH 62/65] [Bugfix] Fix the test_ultravox.py's license (#12806) Signed-off-by: Lu Fang --- tests/lora/test_ultravox.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py index 1218dfa34..703f92ce8 100644 --- a/tests/lora/test_ultravox.py +++ b/tests/lora/test_ultravox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import shutil from os import path from tempfile import TemporaryDirectory -- GitLab From 1a6fcad4c933c89b4060a37d807a7f9e5a680cf3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 6 Feb 2025 06:24:57 +0000 Subject: [PATCH 63/65] Improve `TransformersModel` UX (#12785) --- vllm/model_executor/models/transformers.py | 53 +++++++++++++--------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index dfc714382..43d2c88d3 100644 --- a/vllm/model_executor/models/transformers.py +++ 
b/vllm/model_executor/models/transformers.py @@ -15,7 +15,7 @@ # limitations under the License. """Wrapper around `transformers` models""" import re -from typing import Iterable, Optional, Union +from typing import Iterable, Literal, Optional, Union import torch from torch import nn @@ -72,15 +72,24 @@ def vllm_flash_attention_forward( ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward +def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module): + logger.debug("%s: %s -> %s", name, old_module, new_module) + + def replace_linear_class( linear: nn.Linear, - style: str, + style: Literal["colwise", "rowwise"], quant_config=None) -> Union[ColumnParallelLinear, RowParallelLinear]: """ - In model configurations, we use a neutral type (string) to specify parallel - styles, here we use it to translate nn.Linear into vllm-style tp Linear. - - Quant config is not supported yet + Replace nn.Linear with one of vLLM's tensor parallel linear classes. + + `quant_config` is not yet supported. + Args: + linear (nn.Linear): `nn.Linear` to be replaced. + style (str): Tensor parallel style of the new linear, e.g. "colwise". + quant_config (QuantConfig): Quantization config for the new linear. + Returns: + Union[ColumnParallelLinear, RowParallelLinear]: The new linear. """ if not isinstance(style, str): @@ -93,7 +102,10 @@ def replace_linear_class( }.get(style) if vllm_linear_cls is None: - raise ValueError(f"Unsupported parallel style value: {style}") + logger.warning( + "Unsupported parallel style value: %s. 
" + "This layer will not be tensor parallelized.", style) + return linear class HFCompatibleLinear(vllm_linear_cls): """ @@ -119,25 +131,24 @@ class TransformersModel(nn.Module): super().__init__() logger.info("Using Transformers backend.") - self.vllm_config = vllm_config config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - self.quant_config = quant_config + self.config = config + self.quant_config = quant_config self.vocab_size = config.vocab_size self.unpadded_vocab_size = config.vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, attn_implementation="vllm", - torch_dtype=vllm_config.model_config.dtype, trust_remote_code=vllm_config.model_config.trust_remote_code, ) prefix = self.model.base_model_prefix # MLP modifications - self.tensor_parallelize(self.model) + self.apply_base_model_tp_plan(self.model) # Attention modifications (assumes 1 attention op per hidden layer) tp_size = get_tensor_model_parallel_world_size() @@ -170,13 +181,13 @@ class TransformersModel(nn.Module): config.vocab_size, logit_scale) self.sampler = get_sampler() - def log_replacement(self, name: str, old_module: nn.Module, - new_module: nn.Module): - logger.debug("%s: %s -> %s", name, old_module, new_module) - - def tensor_parallelize(self, module: nn.Module, prefix: str = ""): + def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): + """ + Apply the base model tensor parallelization plan to a module. + Currently only supports linear layers. 
+ """ if (self.config.base_model_tp_plan is None - and self.vllm_config.parallel_config.tensor_parallel_size > 1): + and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " "support it yet!") @@ -189,9 +200,9 @@ class TransformersModel(nn.Module): new_module = replace_linear_class(child_module, style, self.quant_config) setattr(module, child_name, new_module) - self.log_replacement(qual_name, child_module, new_module) + log_replacement(qual_name, child_module, new_module) else: - self.tensor_parallelize(child_module, prefix=qual_name) + self.apply_base_model_tp_plan(child_module, prefix=qual_name) def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings @@ -201,8 +212,8 @@ class TransformersModel(nn.Module): org_num_embeddings=self.config.vocab_size, quant_config=None, ) - self.log_replacement("input embedding", - self.model.get_input_embeddings(), new_module) + log_replacement("input embedding", self.model.get_input_embeddings(), + new_module) self.model.set_input_embeddings(new_module) def forward( -- GitLab From 449d1bce029f87e0d1cf3f30483687ff659268f2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 6 Feb 2025 02:16:20 -0500 Subject: [PATCH 64/65] [Misc] Remove duplicated DeepSeek V2/V3 model definition (#12793) --- vllm/config.py | 1 - vllm/model_executor/models/deepseek_v2.py | 48 +- vllm/model_executor/models/deepseek_v3.py | 806 ---------------------- vllm/model_executor/models/registry.py | 2 +- 4 files changed, 36 insertions(+), 821 deletions(-) delete mode 100644 vllm/model_executor/models/deepseek_v3.py diff --git a/vllm/config.py b/vllm/config.py index bc4bf627b..9ba497576 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -754,7 +754,6 @@ class ModelConfig: @property def is_deepseek_mla(self) -> bool: - # TODO add deepseek_v3 return (hasattr(self.hf_text_config, "model_type")) \ and (self.hf_text_config.model_type in \ ('deepseek_v2', 
'deepseek_v3'))\ diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index fdd584f9d..773f5abe7 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -21,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only DeepseekV2 model.""" +"""Inference-only DeepseekV2/DeepseekV3 model.""" from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch @@ -115,23 +115,32 @@ class DeepseekV2MoE(nn.Module): raise ValueError(f"Unsupported activation: {config.hidden_act}. " "Only silu is supported for now.") - self.experts = FusedMoE(num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts") - self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, quant_config=None, prefix=f"{prefix}.gate") + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts)) + else: + self.gate.e_score_correction_bias = None + + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias) + if 
config.n_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * config.n_shared_experts) @@ -732,6 +741,15 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + + # TODO(simon): support nextn predict layers + if hasattr(self.config, "num_nextn_predict_layers" + ) and self.config.num_nextn_predict_layers > 0: + assert self.config.num_nextn_predict_layers == 1 + layer_idx = self.config.num_hidden_layers + if name.startswith(f"model.layers.{layer_idx}"): + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: @@ -793,3 +811,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): + pass diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py deleted file mode 100644 index 81f82b182..000000000 --- a/vllm/model_executor/models/deepseek_v3.py +++ /dev/null @@ -1,806 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only DeepseekV3 model.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm.attention import Attention, AttentionMetadata -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ModelConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - ReplicatedLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsPP -from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) 
- - -class DeepseekV3MLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class DeepseekV3MoE(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.tp_size = get_tensor_model_parallel_world_size() - self.routed_scaling_factor = config.routed_scaling_factor - self.n_shared_experts = config.n_shared_experts - self.routed_scaling_factor = config.routed_scaling_factor - if self.tp_size > config.n_routed_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {config.n_routed_experts}.") - - if config.hidden_act != "silu": - raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" - "Only silu is supported for now.") - - self.gate = ReplicatedLinear(config.hidden_size, - config.n_routed_experts, - bias=False, - quant_config=None, - prefix=f"{prefix}.gate") - if config.topk_method == "noaux_tc": - self.gate.e_score_correction_bias = nn.Parameter( - torch.empty(config.n_routed_experts)) - else: - self.gate.e_score_correction_bias = None - - self.experts = FusedMoE( - num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts", - scoring_func=config.scoring_func, - e_score_correction_bias=self.gate.e_score_correction_bias) - - if config.n_shared_experts is not None: - intermediate_size = (config.moe_intermediate_size * - config.n_shared_experts) - self.shared_experts = DeepseekV3MLP( - hidden_size=config.hidden_size, - intermediate_size=intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - reduce_results=False, - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - if self.n_shared_experts is not None: - shared_output = self.shared_experts(hidden_states) - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, - router_logits=router_logits) * self.routed_scaling_factor - if shared_output is not None: - final_hidden_states = final_hidden_states + shared_output - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) - - return final_hidden_states.view(num_tokens, hidden_dim) - - -def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: - 
import math - if scale <= 1: - return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 - - -class DeepseekV3Attention(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - hidden_size: int, - num_heads: int, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, - q_lora_rank: int, - kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.v_head_dim = v_head_dim - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.num_heads = num_heads - tp_size = get_tensor_model_parallel_world_size() - assert num_heads % tp_size == 0 - self.num_local_heads = num_heads // tp_size - self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear(self.hidden_size, - self.q_lora_rank, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_a_proj") - self.q_a_layernorm = RMSNorm(self.q_lora_rank, - eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear(q_lora_rank, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_b_proj") - else: - self.q_proj = ColumnParallelLinear(self.hidden_size, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_proj") - - self.kv_a_proj_with_mqa = ReplicatedLinear( - self.hidden_size, - self.kv_lora_rank + self.qk_rope_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_a_proj_with_mqa") - self.kv_a_layernorm = 
RMSNorm(self.kv_lora_rank, - eps=config.rms_norm_eps) - self.kv_b_proj = ColumnParallelLinear( - self.kv_lora_rank, - self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_b_proj") - # O projection. - self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, - self.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj") - if rope_scaling: - rope_scaling["rope_type"] = 'deepseek_yarn' - self.use_normal_rope = False - else: - self.use_normal_rope = True - self.rotary_emb = get_rope(qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=False) - - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] - mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) - self.scaling = self.scaling * mscale * mscale - - self.attn = Attention(self.num_local_heads, - self.qk_head_dim, - self.scaling, - num_kv_heads=self.num_local_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - if self.q_lora_rank is not None: - q = self.q_a_proj(hidden_states)[0] - q = self.q_a_layernorm(q) - q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, - self.qk_head_dim) - else: - q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads, - self.qk_head_dim) - q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], - dim=-1) - latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0] - kv_a, _ = latent_cache.split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - latent_cache = latent_cache.unsqueeze(1) - kv_a = self.kv_a_layernorm(kv_a.contiguous()) - kv = self.kv_b_proj(kv_a)[0] - kv = 
kv.view(-1, self.num_local_heads, - self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k_pe = latent_cache[:, :, self.kv_lora_rank:] - - if self.use_normal_rope: - seq_len = positions.size(0) - ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape - q_pe = q_pe.reshape(seq_len, -1) - k_pe = k_pe.reshape(seq_len, -1) - - q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) - - if self.use_normal_rope: - q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) - - q[..., self.qk_nope_head_dim:] = q_pe - k = torch.empty_like(q) - k[..., :self.qk_nope_head_dim] = k_nope - k[..., self.qk_nope_head_dim:] = k_pe - # padding value to qk_head_dim for alignment - v = torch.nn.functional.pad( - v, [0, self.qk_head_dim - self.v_head_dim], - value=0).view(-1, self.num_local_heads * self.qk_head_dim) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) - attn_output = attn_output.view( - -1, self.num_local_heads, - self.qk_head_dim)[..., :self.v_head_dim].reshape( - -1, self.num_local_heads * self.v_head_dim) - output, _ = self.o_proj(attn_output) - return output - - -class DeepseekV3MLAAttention(nn.Module): - """ - Main reference: DeepseekV2 paper, and FlashInfer Implementation - (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). 
- - For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py - """ - - def __init__( - self, - config: PretrainedConfig, - hidden_size: int, - num_heads: int, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, - q_lora_rank: Optional[int], - kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.v_head_dim = v_head_dim - - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - - self.num_heads = num_heads - tp_size = get_tensor_model_parallel_world_size() - assert num_heads % tp_size == 0 - self.num_local_heads = num_heads // tp_size - - self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear(self.hidden_size, - self.q_lora_rank, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_a_proj") - self.q_a_layernorm = RMSNorm(self.q_lora_rank, - eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear(q_lora_rank, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_b_proj") - else: - self.q_proj = ColumnParallelLinear(self.hidden_size, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_proj") - - self.kv_a_proj_with_mqa = ReplicatedLinear( - self.hidden_size, - self.kv_lora_rank + self.qk_rope_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_a_proj_with_mqa") - self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, - 
eps=config.rms_norm_eps) - self.kv_b_proj = ColumnParallelLinear( - self.kv_lora_rank, - self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_b_proj") - self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, - self.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj") - - if rope_scaling: - rope_scaling["rope_type"] = 'deepseek_yarn' - self.rotary_emb = get_rope(qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=False) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] - mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) - self.scaling = self.scaling * mscale * mscale - - self.mla_attn = Attention( - num_heads=self.num_local_heads, - head_size=self.kv_lora_rank, - scale=self.scaling, - num_kv_heads=1, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_mla=True, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_head_dim, - v_head_dim=self.v_head_dim, - rotary_emb=self.rotary_emb, - q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, - kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, - ) - - self.prefix = prefix - self.debug_layer_idx = int(self.prefix.split(".")[-2]) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - if self.q_lora_rank is not None: - ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) - else: - hidden_states_or_q_c = hidden_states - kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( - [self.kv_lora_rank, 
self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) - return self.mla_attn(hidden_states_or_q_c, kv_c_normed, k_pe, kv_cache, - attn_metadata) - - -class DeepseekV3DecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. - layer_idx = int(prefix.split(sep='.')[-1]) - if model_config.use_mla: - attn_cls = DeepseekV3MLAAttention - else: - attn_cls = DeepseekV3Attention - self.self_attn = attn_cls( - config=config, - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - qk_nope_head_dim=config.qk_nope_head_dim, - qk_rope_head_dim=config.qk_rope_head_dim, - v_head_dim=config.v_head_dim, - q_lora_rank=config.q_lora_rank - if hasattr(config, "q_lora_rank") else None, - kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - if (config.n_routed_experts is not None - and layer_idx >= config.first_k_dense_replace - and layer_idx % config.moe_layer_freq == 0): - self.mlp = DeepseekV3MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - else: - self.mlp = DeepseekV3MLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, - 
eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - residual: Optional[torch.Tensor], - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -@support_torch_compile -class DeepseekV3Model(nn.Module): - - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: DeepseekV3DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - ), - prefix=f"{prefix}.layers") - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", 
"residual"], config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class DeepseekV3ForCausalLM(nn.Module, SupportsPP): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = DeepseekV3Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - 
- def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.n_routed_experts) - - params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - # TODO(simon): 
support nextn predict layers - if hasattr(self.config, "num_nextn_predict_layers" - ) and self.config.num_nextn_predict_layers > 0: - assert self.config.num_nextn_predict_layers == 1 - layer_idx = self.config.num_hidden_layers - if name.startswith(f"model.layers.{layer_idx}"): - continue - - for (param_name, weight_name, shard_id) in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if (("mlp.experts." in name) and name not in params_dict): - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b6708f77d..3b2a7069e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -45,7 +45,7 @@ _TEXT_GENERATION_MODELS = { "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), - "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), + "DeepseekV3ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), -- GitLab From 0408efc6d0c17fba17b2be38d0d0f02e96d2bf9d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 6 Feb 2025 15:23:50 +0800 Subject: [PATCH 65/65] [Misc] Improve error message for incorrect pynvml (#12809) Signed-off-by: youkaichao --- vllm/platforms/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 9c98942b5..e4767a378 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -41,7 +41,11 @@ def cuda_platform_plugin() -> Optional[str]: is_cuda = True finally: pynvml.nvmlShutdown() - except Exception: + except Exception as e: + if "nvml" not in e.__class__.__name__.lower(): + # If the error is not related to NVML, re-raise it. + raise e + # CUDA is supported on Jetson, but NVML may not be. import os -- GitLab